From 4d66434c80050ace3b927f3fc1829b8dd377f78a Mon Sep 17 00:00:00 2001 From: HSILA Date: Fri, 24 Jan 2025 22:50:50 -0500 Subject: [PATCH 001/275] feat: Integrating ChemTEB (#1708) * Add SMILES, AI Paraphrase and Inter-Source Paragraphs PairClassification Tasks * Add chemical subsets of NQ and HotpotQA datasets as Retrieval tasks * Add PubChem Synonyms PairClassification task * Update task __init__ for previously added tasks * Add nomic-bert loader * Add a script to run the evaluation pipeline for chemical-related tasks * Add 15 Wikipedia article classification tasks * Add PairClassification and BitextMining tasks for Coconut SMILES * Fix naming of some Classification and PairClassification tasks * Fix some classification tasks naming issues * Integrate WANDB with benchmarking script * Update .gitignore * Fix `nomic_models.py` issue with retrieval tasks, similar to issue #1115 in original repo * Add one chemical model and some SentenceTransformer models * Fix a naming issue for SentenceTransformer models * Add OpenAI, bge-m3 and matscibert models * Add PubChem SMILES Bitext Mining tasks * Change metric namings to be more descriptive * Add English e5 and bge v1 models, all the sizes * Add two Wikipedia Clustering tasks * Add a try-except in evaluation script to skip faulty models during the benchmark. * Add bge v1.5 models and clustering score extraction to json parser * Add Amazon Titan embedding models * Add Cohere Bedrock models * Add two SDS Classification tasks * Add SDS Classification tasks to classification init and chem_eval * Add a retrieval dataset, update dataset names and revisions * Update revision for the CoconutRetrieval dataset: handle duplicate SMILES (documents) * Update `CoconutSMILES2FormulaPC` task * Change CoconutRetrieval dataset to a smaller one * Update some models - Integrate models added in ChemTEB (such as amazon, cohere bedrock and nomic bert) with latest modeling format in mteb. 
- Update the metadata for the mentioned models * Fix a typo: the `open_weights` argument was repeated twice * Update ChemTEB tasks - Rename some tasks for better readability. - Merge some BitextMining and PairClassification tasks into a single task with subsets (`PubChemSMILESBitextMining` and `PubChemSMILESPC`) - Add a new multilingual task (`PubChemWikiPairClassification`) consisting of 12 languages. - Update dataset paths, revisions and metadata for most tasks. - Add a `Chemistry` domain to `TaskMetadata` * Remove unnecessary files and tasks for MTEB * Update some ChemTEB tasks - Move `PubChemSMILESBitextMining` to `eng` folder - Add citations for tasks involving SDS, NQ, Hotpot, PubChem data - Update Clustering tasks `category` - Change `main_score` for `PubChemAISentenceParaphrasePC` * Create ChemTEB benchmark * Remove `CoconutRetrieval` * Update tasks and benchmarks tables with ChemTEB * Mention ChemTEB in readme * Fix some issues, update task metadata, lint - `eval_langs` fixed - Dataset path was fixed for two datasets - Metadata was completed for all tasks, mainly the following fields: `date`, `task_subtypes`, `dialect`, `sample_creation` - ruff lint - rename `nomic_bert_models.py` to `nomic_bert_model.py` and update it. * Remove `nomic_bert_model.py` as it is now compatible with SentenceTransformer. * Remove `WikipediaAIParagraphsParaphrasePC` task due to being trivial. * Merge `amazon_models` and `cohere_bedrock_models.py` into `bedrock_models.py` * Remove unnecessary `load_data` for some tasks. * Update `bedrock_models.py`, `openai_models.py` and two dataset revisions - Text should be truncated for amazon text embedding models. - `text-embedding-ada-002` returns null embeddings for some inputs with 8192 tokens. 
- Two datasets are updated, dropping very long samples (len > 99th percentile) * Add a layer of dynamic truncation for amazon models in `bedrock_models.py` * Replace `metadata_dict` with `self.metadata` in `PubChemSMILESPC.py` * fix model meta for bedrock models * Add reference comment to original Cohere API implementation --- README.md | 1 + docs/benchmarks.md | 33 ++- docs/tasks.md | 55 +++- mteb/abstasks/TaskMetadata.py | 1 + mteb/benchmarks/benchmarks.py | 44 +++ mteb/models/bedrock_models.py | 264 ++++++++++++++++++ mteb/models/overview.py | 2 + mteb/tasks/BitextMining/__init__.py | 1 + .../eng/PubChemSMILESBitextMining.py | 68 +++++ mteb/tasks/BitextMining/eng/__init__.py | 0 mteb/tasks/Classification/__init__.py | 17 ++ .../eng/SDSEyeProtectionClassification.py | 44 +++ .../eng/SDSGlovesClassification.py | 44 +++ .../eng/WikipediaBioMetChemClassification.py | 37 +++ .../WikipediaBiolumNeurochemClassification.py | 37 +++ ...kipediaChemEngSpecialtiesClassification.py | 37 +++ .../eng/WikipediaChemFieldsClassification.py | 37 +++ .../WikipediaChemistryTopicsClassification.py | 37 +++ ...pediaCompChemSpectroscopyClassification.py | 37 +++ ...ediaCryobiologySeparationClassification.py | 37 +++ ...CrystallographyAnalyticalClassification.py | 37 +++ ...ediaGreenhouseEnantiopureClassification.py | 37 +++ .../WikipediaIsotopesFissionClassification.py | 37 +++ .../WikipediaLuminescenceClassification.py | 37 +++ ...WikipediaOrganicInorganicClassification.py | 37 +++ ...ipediaSaltsSemiconductorsClassification.py | 37 +++ ...ipediaSolidStateColloidalClassification.py | 37 +++ ...kipediaTheoreticalAppliedClassification.py | 37 +++ mteb/tasks/Clustering/__init__.py | 2 + ...WikipediaChemistrySpecialtiesClustering.py | 37 +++ .../eng/WikipediaChemistryTopicsClustering.py | 37 +++ mteb/tasks/PairClassification/__init__.py | 5 + .../eng/PubChemAISentenceParaphrasePC.py | 60 ++++ .../PairClassification/eng/PubChemSMILESPC.py | 128 +++++++++ .../eng/PubChemSynonymPC.py | 61 
++++ .../eng/PubChemWikiParagraphsPC.py | 60 ++++ .../PubChemWikiPairClassification.py | 77 +++++ mteb/tasks/Retrieval/__init__.py | 2 + .../Retrieval/eng/ChemHotpotQARetrieval.py | 60 ++++ mteb/tasks/Retrieval/eng/ChemNQRetrieval.py | 45 +++ 40 files changed, 1678 insertions(+), 25 deletions(-) create mode 100644 mteb/models/bedrock_models.py create mode 100644 mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py create mode 100644 mteb/tasks/BitextMining/eng/__init__.py create mode 100644 mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py create mode 100644 mteb/tasks/Classification/eng/SDSGlovesClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaBiolumNeurochemClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaChemEngSpecialtiesClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaChemistryTopicsClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaCryobiologySeparationClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaGreenhouseEnantiopureClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaIsotopesFissionClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaLuminescenceClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaOrganicInorganicClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaSaltsSemiconductorsClassification.py create mode 100644 mteb/tasks/Classification/eng/WikipediaSolidStateColloidalClassification.py create mode 100644 
mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py create mode 100644 mteb/tasks/Clustering/eng/WikipediaChemistrySpecialtiesClustering.py create mode 100644 mteb/tasks/Clustering/eng/WikipediaChemistryTopicsClustering.py create mode 100644 mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py create mode 100644 mteb/tasks/PairClassification/eng/PubChemSMILESPC.py create mode 100644 mteb/tasks/PairClassification/eng/PubChemSynonymPC.py create mode 100644 mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py create mode 100644 mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py create mode 100644 mteb/tasks/Retrieval/eng/ChemHotpotQARetrieval.py create mode 100644 mteb/tasks/Retrieval/eng/ChemNQRetrieval.py diff --git a/README.md b/README.md index f556cad894..daf715f029 100644 --- a/README.md +++ b/README.md @@ -517,5 +517,6 @@ You may also want to read and cite the amazing work that has extended MTEB & int - Orion Weller, Benjamin Chang, Sean MacAvaney, Kyle Lo, Arman Cohan, Benjamin Van Durme, Dawn Lawrie, Luca Soldaini. "[FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions](https://arxiv.org/abs/2403.15246)" arXiv 2024 - Dawei Zhu, Liang Wang, Nan Yang, Yifan Song, Wenhao Wu, Furu Wei, Sujian Li. "[LongEmbed: Extending Embedding Models for Long Context Retrieval](https://arxiv.org/abs/2404.12096)" arXiv 2024 - Kenneth Enevoldsen, Márton Kardos, Niklas Muennighoff, Kristoffer Laigaard Nielbo. "[The Scandinavian Embedding Benchmarks: Comprehensive Assessment of Multilingual and Monolingual Text Embedding](https://arxiv.org/abs/2406.02396)" arXiv 2024 +- Ali Shiraee Kasmaee, Mohammad Khodadad, Mohammad Arshi Saloot, Nick Sherck, Stephen Dokas, Hamidreza Mahyar, Soheila Samiee. 
"[ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance & Efficiency on a Specific Domain](https://arxiv.org/abs/2412.00532)" arXiv 2024 For works that have used MTEB for benchmarking, you can find them on the [leaderboard](https://huggingface.co/spaces/mteb/leaderboard). diff --git a/docs/benchmarks.md b/docs/benchmarks.md index a5abe50215..7c0f07d878 100644 --- a/docs/benchmarks.md +++ b/docs/benchmarks.md @@ -7,16 +7,27 @@ The following table gives you an overview of the benchmarks in MTEB. | Name | # Tasks | Task Types | Domains | Languages | |------|---------|------------|---------|-----------| -| [CoIR](https://github.com/CoIR-team/coir) | 10 | {'Retrieval': 10} | [Written, Programming] | python,c++,sql,go,eng,php,javascript,ruby,java | -| [MINERSBitextMining](https://arxiv.org/pdf/2406.07424) | 7 | {'BitextMining': 7} | [Written, Social, Reviews] | sun,kaz,tzl,ido,abs,arq,yue,tam,nij,glg,slk,hsb,ber,xho,cbk,pol,uzb,ina,kab,swh,amh,fao,kzj,lfn,uig,sqi,deu,ang,ind,bug,pms,ibo,cym,eus,spa,ceb,tgl,ron,isl,ita,csb,cha,fin,est,pes,jpn,tel,tha,oci,cmn,min,fry,bbc,epo,lit,rus,bos,hrv,war,ara,bjn,mkd,srp,ast,nno,urd,pam,aze,eng,ace,bew,kor,dan,awa,mui,hye,ban,cor,ben,gle,swe,mad,bul,lat,cat,nob,fra,pcm,ell,mar,vie,tat,ukr,gsw,kat,arz,dsb,lvs,nld,tur,bel,max,nds,afr,khm,dtp,yor,ces,gla,zsm,mak,ile,nov,orv,bre,swg,rej,mhr,mon,mal,jav,heb,slv,bhp,kur,wuu,tuk,por,hun,hin,hau,yid | +| [BRIGHT](https://brightbenchmark.github.io/) | 1 | {'Retrieval': 1} | [Non-fiction] | eng | +| [ChemTEB](https://arxiv.org/abs/2412.00532) | 27 | {'BitextMining': 1, 'Classification': 17, 'Clustering': 2, 'PairClassification': 5, 'Retrieval': 2} | [Chemistry] | nld,tur,eng,ces,kor,zho,spa,hin,jpn,deu,fra,msa,por | +| [CoIR](https://github.com/CoIR-team/coir) | 10 | {'Retrieval': 10} | [Written, Programming] | javascript,ruby,sql,go,eng,java,php,python,c++ | +| [LongEmbed](https://arxiv.org/abs/2404.12096v2) | 6 | {'Retrieval': 6} | [Fiction, Academic, 
Written, Blog, Non-fiction, Spoken, Encyclopaedic] | eng | +| [MINERSBitextMining](https://arxiv.org/pdf/2406.07424) | 7 | {'BitextMining': 7} | [Reviews, Written, Social] | sqi,ban,srp,jpn,nds,lat,por,mon,kur,bul,slv,mak,deu,uzb,yor,kzj,max,kat,cha,yid,zsm,spa,pms,mhr,min,fao,heb,nij,mui,tuk,rus,bew,swe,pes,slk,ceb,bjn,ido,abs,ukr,ina,kab,tgl,cor,dan,kaz,fry,rej,hrv,ces,lfn,glg,dsb,hau,ace,urd,ben,yue,nld,eng,epo,ron,xho,wuu,cmn,ind,ang,hsb,mad,pam,nov,swh,bbc,pcm,ara,hye,mkd,nno,ast,jav,lvs,mal,swg,nob,tat,arz,vie,ile,tam,est,ber,bre,csb,pol,afr,cbk,bug,tzl,kor,ibo,hun,war,aze,tha,mar,uig,gla,orv,hin,amh,bel,sun,fin,cat,awa,gsw,isl,oci,ell,cym,arq,ita,fra,bos,dtp,eus,bhp,tel,tur,khm,lit,gle | +| MTEB(Europe, beta) | 74 | {'BitextMining': 7, 'Classification': 21, 'Clustering': 8, 'Retrieval': 15, 'InstructionRetrieval': 3, 'MultilabelClassification': 2, 'PairClassification': 6, 'Reranking': 3, 'STS': 9} | [Web, Fiction, Social, Academic, Religious, Written, Medical, Blog, Constructed, Non-fiction, Legal, News, Government, Reviews, Spoken, Encyclopaedic, Programming, Subtitles] | 
qvm,esk,nlg,toj,gup,llg,jpn,azj,for,lav,kmh,por,bsj,tna,upv,cta,smk,zty,qvz,ntj,ton,uvh,cjk,kgf,gaw,bak,seh,jiv,hui,ksr,uli,kwi,qvw,kkl,arl,msk,omw,aai,tet,yby,mva,fao,kgk,min,kac,dji,box,rus,chz,emp,ktm,bps,bon,nus,bss,cut,sue,meq,kpr,rwo,ceb,zaj,mib,aui,apc,kdl,mxb,okv,rai,big,reg,ulk,mlg,yap,tpt,hrv,nak,plu,nde,kyc,arp,hau,ary,alp,apr,caa,mbh,uvl,zat,bjp,urd,bki,lin,mek,hlt,iws,spl,xav,yml,lcm,ese,xho,are,mux,lww,ndg,ntu,tzj,ame,yss,zar,fil,aii,csy,gvs,zpm,amh,spp,ken,avt,ltz,swh,viv,kmk,zul,bqp,cav,wln,leu,tcs,tuf,mkd,clu,msy,too,ast,amx,quf,jav,yre,nhe,tat,lbk,maj,msm,rug,nor,tbc,prf,pad,zlm,kze,wnc,fai,cbs,mai,aoi,mxq,bao,kos,mlh,nep,mkl,roo,umb,poh,bod,nna,aey,afr,aly,cac,maa,aze,fon,tha,mhl,chd,tpi,tzm,acq,kyz,nbq,yle,ape,bco,att,nin,mkj,yuj,ata,djr,atb,enq,cpb,sxb,rmc,zas,guj,kbq,gfk,tgo,acm,cux,fin,npi,etr,tsn,dob,mpt,alq,byx,cak,cso,spy,oci,asm,ttc,nwi,srn,hmn,gyr,hto,ngu,cpa,tif,fuv,kue,yuw,ote,mgw,ssg,bos,mvn,dop,aso,mox,ndj,stp,mpp,nas,kon,mks,caf,mbs,mcd,wap,cco,tod,aon,aom,cnl,srp,zga,lat,sja,kpj,nhi,nko,swp,bho,blw,mih,mon,sna,bgs,als,kyf,kur,bul,uzb,knj,mam,yor,zos,gdr,aka,bam,bmh,gnw,lid,cha,msc,zpl,gun,qxn,zsm,spa,mgh,nca,cpc,quc,hvn,bvr,agu,ngp,aak,jni,mau,sab,wos,huv,swe,kea,tum,pes,som,pbt,mmo,amo,kgp,taq,sbe,mil,nhg,bmu,bvd,wrs,atg,muy,tpa,ign,vmy,uri,chf,cek,knf,pib,soy,boa,ces,xed,pma,hix,kbc,orm,sim,ace,nhw,kud,ppo,xnn,yut,snx,ilo,zaa,nld,bsp,aau,myk,grn,bkq,cme,bbb,ssd,fur,knc,knv,heg,urw,ayr,ons,sat,crx,rop,szl,suz,ncl,anh,kto,tca,chk,xla,qxh,ziw,ntp,azb,ara,tew,sot,cjv,djk,usa,ltg,cap,arz,lmo,vec,jao,wer,dhg,vie,ded,hop,khk,faa,tam,sus,mwc,ikk,kek,mie,trc,tue,ura,crh,bkd,bzj,kwj,klt,sps,jid,xsi,swa,qxo,lim,nqo,hns,tmd,mbt,mbc,ibo,hun,wrk,bnp,abt,kaq,car,kiz,nvm,nfa,gul,guo,uzn,beo,aer,nhy,otm,cjo,tgk,bel,eri,mca,wsk,rro,row,bsn,tpz,fij,tvk,msb,mpx,abx,poy,sgb,kas,tcz,top,dif,awk,cbc,bea,ell,myy,pus,bmr,ssx,pao,ebk,ajp,opm,wnu,gub,acr,tbf,ubr,cth,taj,aby,kde,mqj,zao,khm,hat,gle,azg,cbv,ian,apu,ptp,kbm,met,plt,sag,agd,pag,ydd,ckb,mzz,div,
kmg,miz,tac,tuo,gvn,boj,tee,mph,mna,qwh,gng,agg,mle,rgu,haw,med,kyg,mig,nhu,tnc,waj,kat,lua,zpz,kpx,tof,ven,dzo,yaa,bqc,klv,qul,kqw,bef,gai,heb,nuy,zac,mcr,zpc,ssw,meu,tuk,gui,kmo,usp,otq,khs,ksj,xbi,nya,cya,aoj,kmr,grc,sny,snp,mir,piu,geb,tgl,dik,agn,dan,qvn,kaz,kbp,mto,tiy,xon,zav,dww,zap,kqa,lac,kne,wat,cbt,naf,inb,kwf,crn,azz,wim,ben,wro,poi,yue,awb,cgc,eng,mjc,amf,mps,mwe,ncu,cle,tdt,hne,zai,gdn,toc,bhl,kir,ron,fue,kyq,ixl,ghs,ncj,tbz,nnq,mio,kwd,mxp,beu,sbk,fuh,gym,ztq,mey,ikw,pab,kmb,cof,tso,ipi,byr,aia,wiv,agm,npl,ter,hye,iou,tku,nno,cnt,kqc,sll,lvs,gnn,nob,dah,nii,san,wuv,udu,gux,ots,zpq,cuk,mbj,nab,bjz,hbo,imo,mcf,glk,zam,twi,srd,sin,zca,qvc,agr,con,kjs,zaw,mav,gum,dov,ood,soq,tte,msa,chq,cbk,isn,kpf,ptu,mri,cao,aeb,cni,aaz,yon,pan,sgz,rom,mop,gwi,nou,uig,gla,far,atd,hin,tnp,bbr,kpg,huu,arn,jvn,cat,awa,amm,urb,run,mit,pir,gam,adz,tir,isl,pls,mlt,qve,nyu,txu,tbg,dwy,quy,ruf,kiw,shp,amr,ita,maq,dgr,fra,kin,ubu,gof,gaz,mgc,cmo,ctu,tel,eus,mcq,bpr,ino,snd,bgt,mwf,acu,jic,kkc,jac,lit,xtd,dyu,kvn,zyp,prs,cop,auc,wed,apb,sqi,ban,wal,poe,tnk,myu,otn,kje,ong,bkx,zsr,hch,agt,wiu,spm,zpu,scn,sri,myw,buk,kdc,zho,sbs,slv,deu,kqf,kvg,tgp,bhg,dwr,xtm,amu,wbp,tim,ory,tos,kan,kbh,mya,mwp,mcb,shn,bdd,cub,yrb,tbo,yal,lug,tah,txq,emi,hub,nso,slk,zpo,zpv,bmk,nss,bjn,nch,bzd,shj,ukr,mbl,tlf,kab,kew,kpw,luo,cpy,kmu,kup,zab,pri,snc,wbi,acf,gmv,glg,amp,qup,nop,srq,yka,apw,mqb,wmt,bch,ewe,sey,lbb,epo,qvh,taw,fuc,kql,ksd,smo,gvf,cmn,yad,ind,qvs,obo,wmw,nsn,anv,mic,pap,ake,fas,cbr,bjr,glv,mdy,tsw,gvc,noa,bus,bjv,cwe,pon,pio,snn,mal,nho,bba,jae,mxt,wol,nif,ycn,lao,tfr,ffm,qub,hus,bzh,mlp,mti,not,nys,tzo,arb,mos,kam,cuc,dgc,pah,pjt,est,bxh,hot,bre,kms,cot,awx,bjk,pwg,cpu,hla,mpm,fuf,pol,tnn,shi,auy,mpj,tuc,bug,kor,zad,war,ars,rkb,mni,cbu,lif,mar,dad,mee,dgz,mco,kik,apz,mkn,sco,mbb,maz,lij,khz,hmo,guh,sun,cbi,lgl,nhr,tiw,daa,amn,amk,tke,lex,mag,cym,eko,zia,mcp,gah,urt,sua,cab,quh,srm,vid,blz,mmx,apn,tur,rmy,bem,yaq,ctp,cui,lus,tav,cax,yva | +| MTEB(Indic, beta) | 23 | {'BitextMining': 
4, 'Clustering': 1, 'Classification': 13, 'STS': 1, 'PairClassification': 1, 'Retrieval': 2, 'Reranking': 1} | [Web, Fiction, Social, Encyclopaedic, Religious, Written, Constructed, Non-fiction, Legal, News, Spoken, Reviews, Government] | ban,pag,ckb,ydd,srp,azj,jpn,bho,por,sna,als,scn,cjk,zho,mwr,bul,slv,deu,yor,bak,ory,aka,bam,kat,lua,kan,dzo,mya,zsm,spa,shn,min,nus,fao,heb,kac,lug,tuk,kea,rus,ssw,tum,swe,nso,pes,slk,som,mup,pbt,nya,ceb,bjn,kmr,apc,taq,ukr,kab,luo,tgl,dik,dan,kaz,kbp,hrv,ces,glg,ary,hau,ace,urd,ben,boy,ewe,ilo,yue,lin,nld,eng,hne,epo,kir,grn,ron,xho,smo,fur,knc,cmn,ind,ayr,sat,szl,pap,fas,kmb,tso,ltz,swh,brx,zul,azb,doi,ara,hye,mkd,nno,ast,jav,lvs,mal,lao,sot,wol,nob,ltg,tat,san,arz,lmo,vec,nor,vie,sag,khk,arb,mos,kam,tam,bgc,mai,gbm,srd,est,twi,crh,sin,nep,swa,umb,bod,pol,lim,nqo,afr,bug,kor,ibo,mri,hun,aeb,war,ars,mni,fon,tha,mar,tpi,tzm,acq,pan,uzn,kik,gla,uig,hin,lij,tgk,amh,bel,sun,acm,guj,fin,cat,awa,fij,npi,run,tsn,kas,tir,isl,asm,mlt,ell,oci,mag,cym,pus,gom,quy,ajp,raj,fuv,ita,kin,bos,fra,gaz,eus,tel,tur,snd,kon,khm,bem,dyu,gle,hat,lit,prs,lus,plt | +| MTEB(Medical) | 12 | {'Retrieval': 9, 'Clustering': 2, 'Reranking': 1} | [Web, Academic, Medical, Written, Non-fiction, Government] | rus,eng,kor,ara,spa,zho,vie,fra,pol,cmn | +| MTEB(Multilingual, beta) | 132 | {'BitextMining': 13, 'Classification': 43, 'Clustering': 17, 'Retrieval': 18, 'InstructionRetrieval': 3, 'MultilabelClassification': 5, 'PairClassification': 11, 'Reranking': 6, 'STS': 16} | [Web, Fiction, Social, Academic, Religious, Written, Medical, Blog, Constructed, Non-fiction, Legal, Government, News, Reviews, Spoken, Encyclopaedic, Programming, Subtitles] | 
qvm,esk,nlg,toj,gup,llg,jpn,azj,for,lav,kmh,por,bsj,tna,upv,cta,smk,zty,qvz,ntj,ton,uvh,cjk,kgf,gaw,bak,seh,jiv,hui,ksr,uli,kwi,qvw,kkl,arl,msk,omw,aai,tet,yby,mva,fao,kgk,min,kac,dji,mui,box,rus,chz,emp,bew,ktm,bps,bon,nus,bss,cut,sue,meq,kpr,rwo,ceb,zaj,mib,aui,apc,kdl,mxb,okv,rai,big,reg,ulk,mlg,yap,tpt,rej,hrv,nak,plu,nde,lfn,kyc,arp,hau,ary,alp,apr,caa,mbh,uvl,zat,bjp,urd,bki,lin,mek,hlt,iws,spl,xav,yml,lcm,ese,xho,are,mux,lww,ndg,ntu,tzj,ame,yss,zar,fil,aii,csy,gvs,zpm,amh,spp,ken,avt,ltz,swh,viv,kmk,zul,bqp,cav,wln,leu,tcs,tuf,mkd,clu,msy,too,ast,amx,quf,jav,yre,nhe,tat,lbk,maj,msm,rug,nor,tbc,prf,pad,zlm,kze,wnc,fai,cbs,mai,aoi,mxq,bao,kos,mlh,nep,mkl,roo,umb,poh,bod,nna,aey,afr,aly,cac,maa,aze,fon,tha,mhl,chd,tpi,tzm,acq,kyz,nbq,yle,ape,bco,att,nin,mkj,yuj,ata,djr,atb,enq,cpb,sxb,rmc,zas,guj,kbq,gfk,tgo,acm,cux,fin,npi,etr,tsn,dob,mpt,alq,byx,cak,cso,spy,oci,asm,ttc,nwi,srn,hmn,gyr,hto,arq,ngu,cpa,tif,fuv,raj,kue,yuw,ote,mgw,ssg,bos,mvn,dop,aso,mox,ndj,stp,mpp,nas,kon,mks,caf,mbs,mcd,wap,cco,tod,aon,aom,cnl,srp,zga,lat,sja,kpj,nhi,nko,swp,bho,blw,mih,mon,sna,bgs,als,kyf,kur,bul,uzb,knj,mam,yor,zos,gdr,aka,bam,bmh,gnw,lid,cha,msc,zpl,gun,qxn,zsm,spa,mgh,nca,yid,pms,mhr,cpc,quc,hvn,bvr,agu,svk,ngp,aak,jni,mau,sab,wos,huv,swe,kea,tum,pes,som,mup,pbt,mmo,amo,kgp,ido,taq,sbe,mil,nhg,bmu,bvd,wrs,atg,muy,tpa,chv,ign,vmy,cor,uri,fry,chf,cek,knf,pib,soy,boa,ces,xed,pma,hix,kbc,orm,sim,ace,nhw,kud,ppo,xnn,yut,boy,snx,ilo,zaa,nld,bsp,aau,myk,grn,bkq,cme,bbb,ssd,fur,knc,wuu,knv,heg,urw,ayr,ons,sat,crx,ang,hsb,rop,szl,suz,mad,ncl,anh,kto,tca,chk,xla,qxh,brx,ziw,ntp,azb,ara,tew,sot,cjv,djk,usa,ltg,cap,arz,lmo,vec,jao,wer,dhg,vie,ded,hop,khk,faa,tam,bgc,sus,mwc,ikk,kek,mie,trc,tue,ura,crh,ber,bkd,bzj,kwj,klt,sps,jid,xsi,swa,qxo,csb,lim,nqo,hns,tmd,mbt,mbc,ibo,hun,wrk,bnp,abt,kaq,car,kiz,nvm,nfa,gul,guo,uzn,beo,aer,nhy,otm,orv,cjo,tgk,bel,eri,mca,wsk,rro,row,bsn,tpz,fij,tvk,msb,mpx,abx,poy,sgb,kas,tcz,top,dif,awk,cbc,bea,ell,myy,pus,bmr,ssx,pao,ebk,ajp,opm,wnu,gub,acr,max,
tbf,ubr,cth,taj,aby,kde,mqj,zao,tyv,khm,hat,gle,azg,cbv,ian,apu,ptp,kbm,met,plt,sag,agd,sah,pag,ydd,ckb,mzz,div,kmg,miz,tac,tuo,gvn,boj,tee,mph,mna,qwh,gng,agg,mle,mak,rgu,haw,med,kyg,mig,nhu,tnc,waj,kat,lua,zpz,kpx,tof,ven,dzo,yaa,bqc,klv,qul,kqw,bef,gai,heb,nuy,zac,mcr,zpc,ssw,meu,tuk,gui,kmo,usp,otq,khs,ksj,xbi,nya,cya,aoj,kmr,grc,sny,snp,mir,piu,geb,tgl,dik,agn,dan,qvn,kaz,kbp,mto,tiy,xon,zav,dww,zap,kqa,lac,kne,wat,cbt,naf,inb,kwf,crn,azz,wim,ben,wro,poi,yue,awb,cgc,eng,mjc,amf,mps,mwe,ncu,cle,tdt,hne,zai,gdn,toc,bhl,kir,ron,fue,kyq,ixl,ghs,ncj,tbz,nnq,mio,kwd,mxp,beu,sbk,fuh,gym,ztq,mey,ikw,pab,pam,kmb,cof,tso,ipi,byr,aia,wiv,pcm,agm,doi,npl,ter,hye,iou,tku,nno,cnt,kqc,sll,lvs,gnn,nob,dah,nii,san,wuv,udu,gux,ots,zpq,cuk,mbj,nab,bjz,hbo,imo,mcf,glk,zam,twi,srd,sin,zca,qvc,agr,con,kjs,zaw,mav,gum,dov,ood,soq,tte,msa,chq,cbk,tzl,isn,kpf,ptu,mri,cao,aeb,cni,aaz,yon,pan,sgz,rom,mop,gwi,nou,uig,gla,far,atd,hin,tnp,bbr,kpg,huu,arn,jvn,cat,awa,amm,urb,run,mit,pir,gam,adz,tir,isl,pls,mlt,gsw,qve,nyu,txu,tbg,dwy,quy,ruf,kiw,shp,amr,ita,maq,dgr,fra,kin,ubu,gof,gaz,mgc,cmo,ctu,tel,eus,mcq,bpr,ino,snd,bgt,mwf,acu,jic,kkc,jac,lit,xtd,dyu,kvn,zyp,prs,cop,auc,wed,apb,sqi,ban,wal,poe,tnk,myu,otn,kje,ong,bkx,zsr,nds,hch,agt,wiu,spm,zpu,scn,sri,myw,buk,kdc,zho,sbs,slv,mwr,deu,kqf,kvg,tgp,bhg,dwr,xtm,amu,wbp,tim,ory,kzj,tos,kan,kbh,mya,mwp,mcb,shn,bdd,cub,yrb,tbo,yal,nij,lug,tah,txq,emi,hub,nso,slk,zpo,zpv,bmk,nss,bjn,nch,abs,bzd,shj,ukr,mbl,ina,tlf,kab,kew,kpw,luo,cpy,kmu,kup,zab,pri,snc,wbi,acf,gmv,glg,dsb,amp,qup,nop,srq,yka,apw,mqb,wmt,bch,ewe,sey,lbb,epo,qvh,taw,fuc,kql,ksd,smo,gvf,cmn,yad,ind,qvs,obo,wmw,nsn,anv,mic,pap,ake,nov,fas,cbr,bjr,glv,mdy,bbc,tsw,gvc,noa,bus,bjv,cwe,pon,pio,snn,swg,mal,nho,bba,jae,mxt,wol,nif,ycn,lao,tfr,ffm,qub,hus,bzh,mlp,mti,not,nys,ile,tzo,arb,mos,kam,cuc,dgc,pah,pjt,gbm,est,bxh,hot,bre,kms,cot,awx,bjk,pwg,cpu,hla,mpm,fuf,pol,tnn,shi,auy,mpj,tuc,bug,kor,zad,war,ars,rkb,mni,cbu,lif,mar,krc,dad,mee,dgz,mco,kik,apz,mkn,sco,mbb,maz,lij,khz,hmo,guh,
sun,cbi,lgl,nhr,tiw,daa,amn,amk,tke,lex,mag,cym,gom,eko,zia,mcp,gah,urt,sua,cab,quh,srm,dtp,vid,blz,bhp,mmx,apn,tur,rmy,bem,yaq,ctp,cui,lus,tav,cax,yva | | [MTEB(Retrieval w/Instructions)](https://arxiv.org/abs/2403.15246) | 3 | {'InstructionRetrieval': 3} | [Written, News] | eng | -| [MTEB(Scandinavian)](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/) | 28 | {'BitextMining': 2, 'Classification': 13, 'Retrieval': 7, 'Clustering': 6} | [Encyclopaedic, Spoken, Non-fiction, Government, News, Fiction, Social, Blog, Reviews, Written, Web, Legal] | nob,fao,swe,isl,dan,nno | -| MTEB(code) | 12 | {'Retrieval': 12} | [Written, Programming] | python,c++,sql,c,go,eng,shell,typescript,php,scala,rust,swift,javascript,ruby,java | -| [MTEB(deu)](https://arxiv.org/html/2401.02709v1) | 19 | {'Classification': 6, 'Clustering': 4, 'PairClassification': 2, 'Reranking': 1, 'Retrieval': 4, 'STS': 2} | [Encyclopaedic, Spoken, News, Reviews, Written, Web] | eng,deu,pol,fra | -| MTEB(eng) | 67 | {'Classification': 12, 'Retrieval': 26, 'Clustering': 11, 'Reranking': 4, 'STS': 10, 'PairClassification': 3, 'Summarization': 1} | [Encyclopaedic, Spoken, Non-fiction, Blog, News, Medical, Social, Programming, Written, Reviews, Web, Academic] | tur,fra,eng,cmn,pol,ita,nld,spa,deu,ara | -| [MTEB(fra)](https://arxiv.org/abs/2405.20468) | 26 | {'Classification': 6, 'Clustering': 7, 'PairClassification': 2, 'Reranking': 2, 'Retrieval': 5, 'STS': 3, 'Summarization': 1} | [Encyclopaedic, Spoken, Non-fiction, News, Social, Reviews, Written, Web, Legal, Academic] | eng,deu,pol,fra | -| MTEB(kor) | 6 | {'Classification': 1, 'Reranking': 1, 'Retrieval': 2, 'STS': 2} | [Encyclopaedic, Spoken, News, Reviews, Written, Web] | kor | -| [MTEB(law)](https://aclanthology.org/2023.eacl-main.148/) | 8 | {'Retrieval': 8} | [Written, Legal] | eng,deu,zho | -| [MTEB(pol)](https://arxiv.org/abs/2405.10138) | 18 | {'Classification': 7, 'Clustering': 3, 'PairClassification': 4, 'STS': 4} | [Spoken, 
Non-fiction, News, Fiction, Social, Written, Web, Legal, Academic] | pol,deu,eng,fra | -| [MTEB(rus)](https://aclanthology.org/2023.eacl-main.148/) | 23 | {'Classification': 9, 'Clustering': 3, 'MultilabelClassification': 2, 'PairClassification': 1, 'Reranking': 2, 'Retrieval': 3, 'STS': 3} | [Encyclopaedic, Spoken, Blog, News, Social, Reviews, Written, Web, Academic] | rus | +| [MTEB(Scandinavian)](https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/) | 28 | {'BitextMining': 2, 'Classification': 13, 'Retrieval': 7, 'Clustering': 6} | [Web, Fiction, Social, Written, Blog, Non-fiction, Legal, News, Spoken, Reviews, Government, Encyclopaedic] | swe,nno,isl,dan,fao,nob | +| MTEB(code) | 12 | {'Retrieval': 12} | [Written, Programming] | javascript,ruby,sql,go,c,eng,shell,typescript,rust,java,php,python,scala,swift,c++ | +| [MTEB(deu)](https://arxiv.org/html/2401.02709v1) | 19 | {'Classification': 6, 'Clustering': 4, 'PairClassification': 2, 'Reranking': 1, 'Retrieval': 4, 'STS': 2} | [Web, Written, News, Spoken, Reviews, Encyclopaedic] | pol,deu,fra,eng | +| MTEB(eng, beta) | 41 | {'Classification': 8, 'Retrieval': 10, 'Clustering': 8, 'Reranking': 2, 'STS': 9, 'PairClassification': 3, 'Summarization': 1} | [Web, Academic, Social, Written, Medical, Blog, Non-fiction, News, Spoken, Reviews, Encyclopaedic, Programming] | nld,tur,eng,ara,spa,ita,deu,fra,pol,cmn | +| MTEB(eng, classic) | 67 | {'Classification': 12, 'Retrieval': 26, 'Clustering': 11, 'Reranking': 4, 'STS': 10, 'PairClassification': 3, 'Summarization': 1} | [Web, Academic, Social, Written, Medical, Blog, Non-fiction, News, Spoken, Reviews, Encyclopaedic, Programming] | nld,tur,eng,ara,spa,ita,deu,fra,pol,cmn | +| [MTEB(fra)](https://arxiv.org/abs/2405.20468) | 26 | {'Classification': 6, 'Clustering': 7, 'PairClassification': 2, 'Reranking': 2, 'Retrieval': 5, 'STS': 3, 'Summarization': 1} | [Web, Academic, Social, Written, Non-fiction, Legal, News, Spoken, Reviews, Encyclopaedic] | 
pol,deu,fra,eng | +| [MTEB(jpn)](https://github.com/sbintuitions/JMTEB) | 16 | {'Clustering': 2, 'Classification': 4, 'STS': 2, 'PairClassification': 1, 'Retrieval': 6, 'Reranking': 1} | [Web, Academic, Written, Non-fiction, News, Spoken, Reviews, Encyclopaedic] | jpn | +| MTEB(kor) | 6 | {'Classification': 1, 'Reranking': 1, 'Retrieval': 2, 'STS': 2} | [Web, Written, News, Spoken, Reviews, Encyclopaedic] | kor | +| [MTEB(law)](https://aclanthology.org/2023.eacl-main.148/) | 8 | {'Retrieval': 8} | [Written, Legal] | deu,zho,eng | +| [MTEB(pol)](https://arxiv.org/abs/2405.10138) | 18 | {'Classification': 7, 'Clustering': 3, 'PairClassification': 4, 'STS': 4} | [Web, Fiction, Academic, Social, Written, Non-fiction, Legal, News, Spoken] | pol,deu,fra,eng | +| [MTEB(rus)](https://aclanthology.org/2023.eacl-main.148/) | 23 | {'Classification': 9, 'Clustering': 3, 'MultilabelClassification': 2, 'PairClassification': 1, 'Reranking': 2, 'Retrieval': 3, 'STS': 3} | [Web, Social, Academic, Written, Blog, News, Spoken, Reviews, Encyclopaedic] | rus | +| [NanoBEIR](https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6) | 13 | {'Retrieval': 13} | [Web, Academic, Social, Medical, Written, Non-fiction, News, Encyclopaedic] | eng | +| [RAR-b](https://arxiv.org/abs/2404.06347) | 17 | {'Retrieval': 17} | [Encyclopaedic, Written, Programming] | eng | \ No newline at end of file diff --git a/docs/tasks.md b/docs/tasks.md index f4ec3c792e..15b9474168 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -122,6 +122,8 @@ The following tables give you an overview of the tasks in MTEB. 
| [CUREv1](https://huggingface.co/datasets/clinia/CUREv1) | ['eng', 'fra', 'spa'] | Retrieval | s2p | [Medical, Academic, Written] | None | None | | [CanadaTaxCourtOutcomesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [CataloniaTweetClassification](https://aclanthology.org/2020.lrec-1.171/) | ['cat', 'spa'] | Classification | s2s | [Social, Government, Written] | None | None | +| [ChemHotpotQARetrieval](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Retrieval | s2p | [Chemistry] | None | None | +| [ChemNQRetrieval](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Retrieval | s2p | [Chemistry] | None | None | | [ClimateFEVER](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | | None | None | | [ClimateFEVERHardNegatives](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | | None | None | | [CmedqaRetrieval](https://aclanthology.org/2022.emnlp-main.357.pdf) | ['cmn'] | Retrieval | s2p | [Medical, Written] | None | None | @@ -425,6 +427,12 @@ The following tables give you an overview of the tasks in MTEB. 
| [PolEmo2.0-IN](https://aclanthology.org/K19-1092.pdf) | ['pol'] | Classification | s2s | [Written, Social] | None | None | | [PolEmo2.0-OUT](https://aclanthology.org/K19-1092.pdf) | ['pol'] | Classification | s2s | [Written, Social] | None | None | | [PpcPC](https://arxiv.org/pdf/2207.12759.pdf) (Sławomir Dadas, 2022) | ['pol'] | PairClassification | s2s | [Fiction, Non-fiction, Web, Written, Spoken, Social, News] | None | None | +| [PubChemAISentenceParaphrasePC](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | PairClassification | s2s | [Chemistry] | None | None | +| [PubChemSMILESBitextMining](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | BitextMining | s2s | [Chemistry] | None | None | +| [PubChemSMILESPC](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | PairClassification | s2s | [Chemistry] | None | None | +| [PubChemSynonymPC](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | PairClassification | s2s | [Chemistry] | None | None | +| [PubChemWikiPairClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['ces', 'deu', 'eng', 'fra', 'hin', 'jpn', 'kor', 'msa', 'nld', 'por', 'spa', 'tur', 'zho'] | PairClassification | s2s | [Chemistry] | None | None | +| [PubChemWikiParagraphsPC](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | PairClassification | p2p | [Chemistry] | None | None | | [PublicHealthQA](https://huggingface.co/datasets/xhluca/publichealth-qa) | ['ara', 'eng', 'fra', 'kor', 'rus', 'spa', 'vie', 'zho'] | Retrieval | s2p | [Medical, Government, Web, Written] | None | None | | [PunjabiNewsClassification](https://github.com/goru001/nlp-for-punjabi/) (Anoop Kunchukuttan, 2020) | ['pan'] | Classification | s2s | [News, Written] | None | None | | [QBQTC](https://github.com/CLUEbenchmark/QBQTC/tree/main/dataset) | ['cmn'] | STS | s2s | | None | None | @@ -468,6 +476,8 @@ The following tables give you an overview of the tasks in MTEB. 
| [SCDDVerificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [SCIDOCS](https://allenai.org/data/scidocs) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Written, Non-fiction] | None | None | | [SCIDOCS-PL](https://allenai.org/data/scidocs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | +| [SDSEyeProtectionClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2p | [Chemistry] | None | None | +| [SDSGlovesClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2p | [Chemistry] | None | None | | [SIB200Classification](https://arxiv.org/abs/2309.07445) (Adelani et al., 2023) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nqo', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 
'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | Classification | s2s | [News, Written] | None | None | | [SIB200ClusteringS2S](https://arxiv.org/abs/2309.07445) (Adelani et al., 2023) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nqo', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | Clustering | s2s | [News, Written] | None | None | | [SICK-BR-PC](https://linux.ime.usp.br/~thalen/SICK_PT.pdf) | ['por'] | PairClassification | s2s | [Web, Written] | None | None | @@ -595,8 +605,25 @@ The following tables give you an overview of the tasks in 
MTEB. | [WebLINXCandidatesReranking](https://mcgill-nlp.github.io/weblinx) (Xing Han Lù, 2024) | ['eng'] | Reranking | p2p | [Academic, Web, Written] | None | None | | [WikiCitiesClustering](https://huggingface.co/datasets/wikipedia) | ['eng'] | Clustering | p2p | [Encyclopaedic, Written] | None | None | | [WikiClusteringP2P.v2](https://github.com/Rysias/wiki-clustering) | ['bos', 'cat', 'ces', 'dan', 'eus', 'glv', 'ilo', 'kur', 'lav', 'min', 'mlt', 'sco', 'sqi', 'wln'] | Clustering | p2p | [Encyclopaedic, Written] | None | None | +| [WikipediaBioMetChemClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaBiolumNeurochemClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaChemEngSpecialtiesClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaChemFieldsClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaChemistryTopicsClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaChemistryTopicsClustering](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Clustering | s2p | [Chemistry] | None | None | +| [WikipediaCompChemSpectroscopyClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaCryobiologySeparationClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaCrystallographyAnalyticalClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | 
[Chemistry] | None | None | +| [WikipediaGreenhouseEnantiopureClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaIsotopesFissionClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaLuminescenceClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaOrganicInorganicClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | | [WikipediaRerankingMultilingual](https://huggingface.co/datasets/ellamind/wikipedia-2023-11-reranking-multilingual) | ['ben', 'bul', 'ces', 'dan', 'deu', 'eng', 'fas', 'fin', 'hin', 'ita', 'nld', 'nor', 'por', 'ron', 'srp', 'swe'] | Reranking | s2p | [Encyclopaedic, Written] | {'test': 24000} | {'test': {'num_samples': 24000, 'number_of_characters': 83866932, 'num_positive': 24000, 'num_negative': 192000, 'min_query_length': 7, 'avg_query_length': 59.09, 'max_query_length': 180, 'unique_query': 23997, 'min_positive_length': 100, 'avg_positive_length': 385.45, 'max_positive_length': 3515, 'unique_positive': 23993, 'min_negative_length': 100, 'avg_negative_length': 381.24, 'max_negative_length': 9461, 'unique_negative': 191783, 'hf_subset_descriptive_stats': {'bg': {'num_samples': 1500, 'number_of_characters': 5145316, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 18, 'avg_query_length': 60.83, 'max_query_length': 166, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 375.89, 'max_positive_length': 2241, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 374.19, 'max_negative_length': 4869, 'unique_negative': 11996}, 'bn': {'num_samples': 1500, 'number_of_characters': 5390581, 'num_positive': 1500, 'num_negative': 12000, 
'min_query_length': 7, 'avg_query_length': 47.27, 'max_query_length': 123, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 394.59, 'max_positive_length': 2338, 'unique_positive': 1499, 'min_negative_length': 100, 'avg_negative_length': 393.98, 'max_negative_length': 5104, 'unique_negative': 11996}, 'cs': {'num_samples': 1500, 'number_of_characters': 5079180, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 17, 'avg_query_length': 56.27, 'max_query_length': 137, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 383.84, 'max_positive_length': 2300, 'unique_positive': 1499, 'min_negative_length': 100, 'avg_negative_length': 368.25, 'max_negative_length': 3487, 'unique_negative': 11982}, 'da': {'num_samples': 1500, 'number_of_characters': 4746132, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 17, 'avg_query_length': 56.75, 'max_query_length': 137, 'unique_query': 1499, 'min_positive_length': 100, 'avg_positive_length': 351.68, 'max_positive_length': 2159, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 344.46, 'max_negative_length': 2563, 'unique_negative': 11972}, 'de': {'num_samples': 1500, 'number_of_characters': 5483592, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 20, 'avg_query_length': 70.0, 'max_query_length': 180, 'unique_query': 1499, 'min_positive_length': 100, 'avg_positive_length': 391.54, 'max_positive_length': 2674, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 399.27, 'max_negative_length': 3083, 'unique_negative': 12000}, 'en': {'num_samples': 1500, 'number_of_characters': 6217884, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 18, 'avg_query_length': 68.37, 'max_query_length': 162, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 451.73, 'max_positive_length': 3515, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 453.14, 
'max_negative_length': 3662, 'unique_negative': 12000}, 'fa': {'num_samples': 1500, 'number_of_characters': 4732619, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 12, 'avg_query_length': 48.67, 'max_query_length': 119, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 347.7, 'max_positive_length': 2571, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 344.84, 'max_negative_length': 4707, 'unique_negative': 11978}, 'fi': {'num_samples': 1500, 'number_of_characters': 5209132, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 14, 'avg_query_length': 55.34, 'max_query_length': 132, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 394.71, 'max_positive_length': 2129, 'unique_positive': 1498, 'min_negative_length': 100, 'avg_negative_length': 377.84, 'max_negative_length': 2574, 'unique_negative': 11972}, 'hi': {'num_samples': 1500, 'number_of_characters': 5620959, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 13, 'avg_query_length': 50.78, 'max_query_length': 125, 'unique_query': 1499, 'min_positive_length': 100, 'avg_positive_length': 420.38, 'max_positive_length': 2361, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 409.52, 'max_negative_length': 5912, 'unique_negative': 11996}, 'it': {'num_samples': 1500, 'number_of_characters': 5420496, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 23, 'avg_query_length': 70.05, 'max_query_length': 156, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 396.97, 'max_positive_length': 2082, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 393.33, 'max_negative_length': 9461, 'unique_negative': 11993}, 'nl': {'num_samples': 1500, 'number_of_characters': 5169556, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 18, 'avg_query_length': 65.34, 'max_query_length': 136, 'unique_query': 1500, 
'min_positive_length': 100, 'avg_positive_length': 380.79, 'max_positive_length': 1864, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 375.03, 'max_negative_length': 3641, 'unique_negative': 11985}, 'pt': {'num_samples': 1500, 'number_of_characters': 5474356, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 18, 'avg_query_length': 65.12, 'max_query_length': 176, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 404.02, 'max_positive_length': 3057, 'unique_positive': 1499, 'min_negative_length': 100, 'avg_negative_length': 397.55, 'max_negative_length': 2877, 'unique_negative': 11991}, 'ro': {'num_samples': 1500, 'number_of_characters': 4796113, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 14, 'avg_query_length': 61.97, 'max_query_length': 169, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 346.71, 'max_positive_length': 1917, 'unique_positive': 1499, 'min_negative_length': 100, 'avg_negative_length': 348.59, 'max_negative_length': 4213, 'unique_negative': 11971}, 'sr': {'num_samples': 1500, 'number_of_characters': 5271732, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 15, 'avg_query_length': 55.67, 'max_query_length': 146, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 386.35, 'max_positive_length': 2421, 'unique_positive': 1499, 'min_negative_length': 100, 'avg_negative_length': 384.06, 'max_negative_length': 3668, 'unique_negative': 11974}, 'no': {'num_samples': 1500, 'number_of_characters': 5036586, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 14, 'avg_query_length': 55.29, 'max_query_length': 129, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 367.72, 'max_positive_length': 1450, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 366.84, 'max_negative_length': 2841, 'unique_negative': 11996}, 'sv': {'num_samples': 1500, 
'number_of_characters': 5072698, 'num_positive': 1500, 'num_negative': 12000, 'min_query_length': 17, 'avg_query_length': 57.73, 'max_query_length': 133, 'unique_query': 1500, 'min_positive_length': 100, 'avg_positive_length': 372.59, 'max_positive_length': 2493, 'unique_positive': 1500, 'min_negative_length': 100, 'avg_negative_length': 368.94, 'max_negative_length': 3680, 'unique_negative': 11999}}}} | | [WikipediaRetrievalMultilingual](https://huggingface.co/datasets/ellamind/wikipedia-2023-11-retrieval-multilingual-queries) | ['ben', 'bul', 'ces', 'dan', 'deu', 'eng', 'fas', 'fin', 'hin', 'ita', 'nld', 'nor', 'por', 'ron', 'srp', 'swe'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [WikipediaSaltsSemiconductorsClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaSolidStateColloidalClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | +| [WikipediaSpecialtiesInChemistryClustering](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Clustering | s2p | [Chemistry] | None | None | +| [WikipediaTheoreticalAppliedClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | | [WinoGrande](https://winogrande.allenai.org/) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [WisesightSentimentClassification](https://github.com/PyThaiNLP/wisesight-sentiment) | ['tha'] | Classification | s2s | [Social, News, Written] | None | None | | XMarket (Bonab et al., 2021) | ['deu', 'eng', 'spa'] | Retrieval | s2p | | None | None | @@ -819,7 +846,7 @@ The following tables give you an overview of the tasks in MTEB. 
| cco | Comaltepec Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | ceb | Cebuano | Austronesian | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | | cek | Eastern Khumi Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ces | Czech | Indo-European | 4 | 5 | 2 | 0 | 1 | 1 | 1 | 2 | 0 | 0 | 0 | 16 | +| ces | Czech | Indo-European | 4 | 5 | 2 | 0 | 1 | 2 | 1 | 2 | 0 | 0 | 0 | 17 | | cgc | Kagayanen | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | cha | Chamorro | Austronesian | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | | chd | Highland Oaxaca Chontal | Tequistlatecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -875,7 +902,7 @@ The following tables give you an overview of the tasks in MTEB. | dah | Gwahatike | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | dan | Danish | Indo-European | 5 | 9 | 2 | 0 | 1 | 0 | 1 | 5 | 0 | 0 | 0 | 23 | | ded | Dedua | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| deu | German | Indo-European | 6 | 14 | 7 | 0 | 1 | 6 | 2 | 18 | 4 | 0 | 0 | 58 | +| deu | German | Indo-European | 6 | 14 | 7 | 0 | 1 | 7 | 2 | 18 | 4 | 0 | 0 | 59 | | dgc | Casiguran Dumagat Agta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | dgr | Dogrib | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | dgz | Daga | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -903,7 +930,7 @@ The following tables give you an overview of the tasks in MTEB. 
| ell | Modern Greek (1453-) | Indo-European | 3 | 6 | 1 | 0 | 1 | 2 | 0 | 3 | 0 | 0 | 0 | 16 | | emi | Mussau-Emira | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | emp | Northern Emberá | Chocoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eng | English | Indo-European | 16 | 143 | 16 | 3 | 1 | 8 | 8 | 105 | 13 | 2 | 1 | 316 | +| eng | English | Indo-European | 17 | 160 | 18 | 3 | 1 | 13 | 8 | 107 | 13 | 2 | 1 | 343 | | enq | Enga | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | epo | Esperanto | Artificial Language | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | | eri | Ogea | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -924,7 +951,7 @@ The following tables give you an overview of the tasks in MTEB. | fin | Finnish | Uralic | 3 | 5 | 1 | 0 | 1 | 1 | 2 | 5 | 1 | 0 | 0 | 19 | | fon | Fon | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | | for | Fore | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fra | French | Indo-European | 7 | 13 | 8 | 0 | 1 | 5 | 3 | 15 | 4 | 0 | 1 | 57 | +| fra | French | Indo-European | 7 | 13 | 8 | 0 | 1 | 6 | 3 | 15 | 4 | 0 | 1 | 58 | | fry | Western Frisian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | fuc | Pulaar | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | fue | Borgu Fulfulde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -981,7 +1008,7 @@ The following tables give you an overview of the tasks in MTEB. 
| hch | Huichol | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | heb | Hebrew | Afro-Asiatic | 4 | 5 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 11 | | heg | Helong | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hin | Hindi | Indo-European | 9 | 12 | 2 | 0 | 0 | 1 | 2 | 10 | 2 | 0 | 0 | 38 | +| hin | Hindi | Indo-European | 9 | 12 | 2 | 0 | 0 | 2 | 2 | 10 | 2 | 0 | 0 | 39 | | hix | Hixkaryána | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | hla | Halia | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | hlt | Matu Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1030,7 +1057,7 @@ The following tables give you an overview of the tasks in MTEB. | jid | Bu (Kaduna State) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | jiv | Shuar | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | jni | Janji | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jpn | Japanese | Japonic | 5 | 8 | 3 | 0 | 0 | 1 | 3 | 13 | 2 | 0 | 0 | 35 | +| jpn | Japanese | Japonic | 5 | 8 | 3 | 0 | 0 | 2 | 3 | 13 | 2 | 0 | 0 | 36 | | jvn | Caribbean Javanese | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | kab | Kabyle | Afro-Asiatic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | | kac | Kachin | Sino-Tibetan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | @@ -1086,7 +1113,7 @@ The following tables give you an overview of the tasks in MTEB. 
| knj | Western Kanjobal | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | knv | Tabo | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | kon | Kongo | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kor | Korean | Koreanic | 4 | 8 | 1 | 0 | 1 | 2 | 1 | 9 | 3 | 0 | 0 | 29 | +| kor | Korean | Koreanic | 4 | 8 | 1 | 0 | 1 | 3 | 1 | 9 | 3 | 0 | 0 | 30 | | kos | Kosraean | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | kpf | Komba | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | kpg | Kapingamarangi | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1231,7 +1258,7 @@ The following tables give you an overview of the tasks in MTEB. | mqb | Mbuko | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | mqj | Mamasa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | mri | Maori | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | -| msa | Malay (macrolanguage) | Unclassified | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| msa | Malay (macrolanguage) | Unclassified | 1 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 | | msb | Masbatenyo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | msc | Sankaran Maninka | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | msk | Mansaka | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1292,7 +1319,7 @@ The following tables give you an overview of the tasks in MTEB. 
| nij | Ngaju | Austronesian | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | | nin | Ninzo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | nko | Nkonya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nld | Dutch | Indo-European | 6 | 6 | 1 | 0 | 1 | 0 | 1 | 2 | 2 | 0 | 0 | 19 | +| nld | Dutch | Indo-European | 6 | 6 | 1 | 0 | 1 | 1 | 1 | 2 | 2 | 0 | 0 | 20 | | nlg | Gela | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | nna | Nyangumarta | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | nno | Norwegian Nynorsk | Unclassified | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | @@ -1364,7 +1391,7 @@ The following tables give you an overview of the tasks in MTEB. | poi | Highland Popoluca | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | pol | Polish | Indo-European | 4 | 11 | 4 | 0 | 1 | 4 | 0 | 18 | 4 | 0 | 0 | 46 | | pon | Pohnpeian | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| por | Portuguese | Indo-European | 4 | 9 | 1 | 0 | 2 | 2 | 1 | 5 | 3 | 0 | 0 | 27 | +| por | Portuguese | Indo-European | 4 | 9 | 1 | 0 | 2 | 3 | 1 | 5 | 3 | 0 | 0 | 28 | | poy | Pogolo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | ppo | Folopa | Teberan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | prf | Paranan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1449,7 +1476,7 @@ The following tables give you an overview of the tasks in MTEB. 
| soq | Kanasi | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | sot | Southern Sotho | Atlantic-Congo | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | | soy | Miyobe | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| spa | Spanish | Indo-European | 4 | 13 | 4 | 0 | 1 | 2 | 2 | 13 | 4 | 0 | 0 | 43 | +| spa | Spanish | Indo-European | 4 | 13 | 4 | 0 | 1 | 3 | 2 | 13 | 4 | 0 | 0 | 44 | | spl | Selepet | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | spm | Akukem | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | spp | Supyire Senoufo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1546,7 +1573,7 @@ The following tables give you an overview of the tasks in MTEB. | tuk | Turkmen | Turkic | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | | tum | Tumbuka | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | | tuo | Tucano | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tur | Turkish | Turkic | 4 | 7 | 1 | 0 | 0 | 2 | 0 | 3 | 2 | 0 | 0 | 19 | +| tur | Turkish | Turkic | 4 | 7 | 1 | 0 | 0 | 3 | 0 | 3 | 2 | 0 | 0 | 20 | | tvk | Southeast Ambrym | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | twi | Twi | Unclassified | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | | txq | Tii | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1656,7 +1683,7 @@ The following tables give you an overview of the tasks in MTEB. 
| zaw | Mitla Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zca | Coatecas Altas Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zga | Kinga | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zho | Chinese | Unclassified | 2 | 2 | 1 | 0 | 0 | 1 | 1 | 13 | 0 | 0 | 0 | 20 | +| zho | Chinese | Unclassified | 2 | 2 | 1 | 0 | 0 | 2 | 1 | 13 | 0 | 0 | 0 | 21 | | zia | Zia | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | ziw | Zigula | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zlm | Malay (individual language) | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1675,7 +1702,7 @@ The following tables give you an overview of the tasks in MTEB. | zty | Yatee Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zul | Zulu | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | | zyp | Zyphe Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| Total | None | None | None | 1394 | 795 | 304 | 3 | 28 | 67 | 51 | 473 | 85 | 2 | 2 | +| Total | None | None | None | 1395 | 812 | 306 | 3 | 28 | 84 | 51 | 475 | 85 | 2 | 2 | diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py index 07c4f97a04..b04753d877 100644 --- a/mteb/abstasks/TaskMetadata.py +++ b/mteb/abstasks/TaskMetadata.py @@ -70,6 +70,7 @@ "Web", "Written", "Programming", + "Chemistry", ] SAMPLE_CREATION_METHOD = Literal[ diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 233c7a79b3..415b11eddb 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -1232,3 +1232,47 @@ def load_results( primaryClass={cs.CL} }""", ) + +CHEMTEB = Benchmark( + name="ChemTEB", + tasks=get_tasks( + tasks=[ + "PubChemSMILESBitextMining", + "SDSEyeProtectionClassification", + "SDSGlovesClassification", + "WikipediaBioMetChemClassification", + 
"WikipediaGreenhouseEnantiopureClassification", + "WikipediaSolidStateColloidalClassification", + "WikipediaOrganicInorganicClassification", + "WikipediaCryobiologySeparationClassification", + "WikipediaChemistryTopicsClassification", + "WikipediaTheoreticalAppliedClassification", + "WikipediaChemFieldsClassification", + "WikipediaLuminescenceClassification", + "WikipediaIsotopesFissionClassification", + "WikipediaSaltsSemiconductorsClassification", + "WikipediaBiolumNeurochemClassification", + "WikipediaCrystallographyAnalyticalClassification", + "WikipediaCompChemSpectroscopyClassification", + "WikipediaChemEngSpecialtiesClassification", + "WikipediaChemistryTopicsClustering", + "WikipediaSpecialtiesInChemistryClustering", + "PubChemAISentenceParaphrasePC", + "PubChemSMILESPC", + "PubChemSynonymPC", + "PubChemWikiParagraphsPC", + "PubChemWikiPairClassification", + "ChemNQRetrieval", + "ChemHotpotQARetrieval", + ], + ), + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} +}""", +) diff --git a/mteb/models/bedrock_models.py b/mteb/models/bedrock_models.py new file mode 100644 index 0000000000..4616209df1 --- /dev/null +++ b/mteb/models/bedrock_models.py @@ -0,0 +1,264 @@ +from __future__ import annotations + +import json +import logging +import re +from functools import partial +from typing import Any + +import numpy as np +import tqdm + +from mteb.encoder_interface import PromptType +from mteb.model_meta import ModelMeta +from mteb.models.cohere_models import model_prompts as 
cohere_model_prompts +from mteb.models.cohere_models import supported_languages as cohere_supported_languages +from mteb.requires_package import requires_package + +from .wrapper import Wrapper + +logger = logging.getLogger(__name__) + + +class BedrockWrapper(Wrapper): + def __init__( + self, + model_id: str, + provider: str, + max_tokens: int, + model_prompts: dict[str, str] | None = None, + **kwargs, + ) -> None: + requires_package(self, "boto3", "The AWS SDK for Python") + import boto3 + + boto3_session = boto3.session.Session() + region_name = boto3_session.region_name + self._client = boto3.client("bedrock-runtime", region_name) + + self._model_id = model_id + self._provider = provider.lower() + + if self._provider == "cohere": + self.model_prompts = ( + self.validate_task_to_prompt_name(model_prompts) + if model_prompts + else None + ) + self._max_batch_size = 96 + self._max_sequence_length = max_tokens * 4 + else: + self._max_tokens = max_tokens + + def encode( + self, + sentences: list[str], + *, + task_name: str | None = None, + prompt_type: PromptType | None = None, + **kwargs: Any, + ) -> np.ndarray: + requires_package(self, "boto3", "Amazon Bedrock") + show_progress_bar = ( + False + if "show_progress_bar" not in kwargs + else kwargs.pop("show_progress_bar") + ) + if self._provider == "amazon": + return self._encode_amazon(sentences, show_progress_bar) + elif self._provider == "cohere": + prompt_name = self.get_prompt_name( + self.model_prompts, task_name, prompt_type + ) + cohere_task_type = self.model_prompts.get(prompt_name, "search_document") + return self._encode_cohere(sentences, cohere_task_type, show_progress_bar) + else: + raise ValueError( + f"Unknown provider '{self._provider}'. Must be 'amazon' or 'cohere'." 
+ ) + + def _encode_amazon( + self, sentences: list[str], show_progress_bar: bool = False + ) -> np.ndarray: + from botocore.exceptions import ValidationError + + all_embeddings = [] + # https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html + max_sequence_length = int(self._max_tokens * 4.5) + + for sentence in tqdm.tqdm( + sentences, leave=False, disable=not show_progress_bar + ): + if len(sentence) > max_sequence_length: + truncated_sentence = sentence[:max_sequence_length] + else: + truncated_sentence = sentence + + try: + embedding = self._embed_amazon(truncated_sentence) + all_embeddings.append(embedding) + + except ValidationError as e: + error_str = str(e) + pattern = r"request input token count:\s*(\d+)" + match = re.search(pattern, error_str) + if match: + num_tokens = int(match.group(1)) + + ratio = 0.9 * (self._max_tokens / num_tokens) + dynamic_cutoff = int(len(truncated_sentence) * ratio) + + embedding = self._embed_amazon(truncated_sentence[:dynamic_cutoff]) + all_embeddings.append(embedding) + else: + raise e + + return np.array(all_embeddings) + + def _encode_cohere( + self, + sentences: list[str], + cohere_task_type: str, + show_progress_bar: bool = False, + ) -> np.ndarray: + batches = [ + sentences[i : i + self._max_batch_size] + for i in range(0, len(sentences), self._max_batch_size) + ] + + all_embeddings = [] + + for batch in tqdm.tqdm(batches, leave=False, disable=not show_progress_bar): + response = self._client.invoke_model( + body=json.dumps( + { + "texts": [sent[: self._max_sequence_length] for sent in batch], + "input_type": cohere_task_type, + } + ), + modelId=self._model_id, + accept="*/*", + contentType="application/json", + ) + all_embeddings.extend(self._to_numpy(response)) + + return np.array(all_embeddings) + + def _embed_amazon(self, sentence: str) -> np.ndarray: + response = self._client.invoke_model( + body=json.dumps({"inputText": sentence}), + modelId=self._model_id, + accept="application/json", + 
contentType="application/json", + ) + return self._to_numpy(response) + + def _to_numpy(self, embedding_response) -> np.ndarray: + response = json.loads(embedding_response.get("body").read()) + key = "embedding" if self._provider == "amazon" else "embeddings" + return np.array(response[key]) + + +amazon_titan_embed_text_v1 = ModelMeta( + name="bedrock/amazon-titan-embed-text-v1", + revision="1", + release_date="2023-09-27", + languages=None, # not specified + loader=partial( + BedrockWrapper, + model_id="amazon.titan-embed-text-v1", + provider="amazon", + max_tokens=8192, + ), + max_tokens=8192, + embed_dim=1536, + open_weights=False, + n_parameters=None, + public_training_code=None, + public_training_data=None, # assumed + training_datasets=None, + license=None, + reference="https://aws.amazon.com/about-aws/whats-new/2023/09/amazon-titan-embeddings-generally-available/", + similarity_fn_name="cosine", + framework=["API"], + use_instructions=False, +) + +amazon_titan_embed_text_v2 = ModelMeta( + name="bedrock/amazon-titan-embed-text-v2", + revision="1", + release_date="2024-04-30", + languages=None, # not specified + loader=partial( + BedrockWrapper, + model_id="amazon.titan-embed-text-v2:0", + provider="amazon", + max_tokens=8192, + ), + max_tokens=8192, + embed_dim=1024, + open_weights=False, + n_parameters=None, + public_training_code=None, + public_training_data=None, # assumed + training_datasets=None, + license=None, + reference="https://aws.amazon.com/about-aws/whats-new/2024/04/amazon-titan-text-embeddings-v2-amazon-bedrock/", + similarity_fn_name="cosine", + framework=["API"], + use_instructions=False, +) +# Note: For the original Cohere API implementation, refer to: +# https://github.com/embeddings-benchmark/mteb/blob/main/mteb/models/cohere_models.py +# This implementation uses the Amazon Bedrock endpoint for Cohere models. 
+cohere_embed_english_v3 = ModelMeta(
+    loader=partial(
+        BedrockWrapper,
+        model_id="cohere.embed-english-v3",
+        provider="cohere",
+        max_tokens=512,
+        model_prompts=cohere_model_prompts,
+    ),
+    name="bedrock/cohere-embed-english-v3",
+    languages=["eng-Latn"],
+    open_weights=False,
+    reference="https://cohere.com/blog/introducing-embed-v3",
+    revision="1",
+    release_date="2023-11-02",
+    n_parameters=None,
+    public_training_code=None,
+    public_training_data=None,  # assumed
+    training_datasets=None,
+    max_tokens=512,
+    embed_dim=1024,
+    license=None,
+    similarity_fn_name="cosine",
+    framework=["API"],
+    use_instructions=True,
+)
+
+cohere_embed_multilingual_v3 = ModelMeta(
+    loader=partial(
+        BedrockWrapper,
+        model_id="cohere.embed-multilingual-v3",
+        provider="cohere",
+        max_tokens=512,
+        model_prompts=cohere_model_prompts,
+    ),
+    name="bedrock/cohere-embed-multilingual-v3",
+    languages=cohere_supported_languages,
+    open_weights=False,
+    reference="https://cohere.com/blog/introducing-embed-v3",
+    revision="1",
+    release_date="2023-11-02",
+    n_parameters=None,
+    public_training_code=None,
+    public_training_data=None,  # assumed
+    training_datasets=None,
+    max_tokens=512,
+    embed_dim=1024,
+    license=None,
+    similarity_fn_name="cosine",
+    framework=["API"],
+    use_instructions=True,
+)
diff --git a/mteb/models/overview.py b/mteb/models/overview.py
index ccc8fbdda4..399d8359c5 100644
--- a/mteb/models/overview.py
+++ b/mteb/models/overview.py
@@ -13,6 +13,7 @@
 from mteb.model_meta import ModelMeta
 from mteb.models import (
     arctic_models,
+    bedrock_models,
     bge_models,
     bm25,
     cde_models,
@@ -100,6 +101,7 @@
     uae_models,
     text2vec_models,
     stella_models,
+    bedrock_models,
     uae_models,
     voyage_models,
 ]
diff --git a/mteb/tasks/BitextMining/__init__.py b/mteb/tasks/BitextMining/__init__.py
index c176077215..1cec5d5ddc 100644
--- a/mteb/tasks/BitextMining/__init__.py
+++ b/mteb/tasks/BitextMining/__init__.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from .dan.BornholmskBitextMining import *
+from .eng.PubChemSMILESBitextMining import *
 from .kat.TbilisiCityHallBitextMining import *
 from .multilingual.BibleNLPBitextMining import *
 from .multilingual.BUCCBitextMining import *
diff --git a/mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py b/mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py
new file mode 100644
index 0000000000..4951d8c596
--- /dev/null
+++ b/mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py
@@ -0,0 +1,71 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining
+from mteb.abstasks.MultilingualTask import MultilingualTask
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+COL_MAPPING = {
+    "iso-title": {"title": "sentence1", "isomeric_smiles": "sentence2"},
+    "iso-desc": {"description": "sentence1", "isomeric_smiles": "sentence2"},
+    "canon-title": {"title": "sentence1", "canonical_smiles": "sentence2"},
+    "canon-desc": {"description": "sentence1", "canonical_smiles": "sentence2"},
+}
+
+EVAL_LANGS = {
+    "iso-title": ["eng-Latn", "eng-Latn"],
+    "iso-desc": ["eng-Latn", "eng-Latn"],
+    "canon-title": ["eng-Latn", "eng-Latn"],
+    "canon-desc": ["eng-Latn", "eng-Latn"],
+}
+
+
+class PubChemSMILESBitextMining(MultilingualTask, AbsTaskBitextMining):
+    """ChemTEB bitext-mining task pairing PubChem text with SMILES (four subsets)."""
+
+    metadata = TaskMetadata(
+        name="PubChemSMILESBitextMining",
+        dataset={
+            "path": "BASF-AI/PubChemSMILESBitextMining",
+            "revision": "36700ea628118312ebf2f90ad2353a9a8f188dc9",
+        },
+        description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
+        reference="https://arxiv.org/abs/2412.00532",
+        type="BitextMining",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=EVAL_LANGS,
+        main_score="f1",
+        date=("2024-06-01", "2024-11-30"),
+        domains=["Chemistry"],
+        task_subtypes=[],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="created",
+        bibtex_citation=r"""
+    @article{kasmaee2024chemteb,
+    title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
+    author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+    journal={arXiv preprint arXiv:2412.00532},
+    year={2024}
+    }
+    @article{kim2023pubchem,
+    title={PubChem 2023 update},
+    author={Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others},
+    journal={Nucleic acids research},
+    volume={51},
+    number={D1},
+    pages={D1373--D1380},
+    year={2023},
+    publisher={Oxford University Press}
+    }
+    """,
+    )
+
+    def dataset_transform(self):
+        """Rename every subset's columns to ``sentence1``/``sentence2``."""
+        for subset in self.hf_subsets:
+            self.dataset[subset] = self.dataset[subset].rename_columns(
+                COL_MAPPING[subset]
+            )
diff --git a/mteb/tasks/BitextMining/eng/__init__.py b/mteb/tasks/BitextMining/eng/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/mteb/tasks/Classification/__init__.py b/mteb/tasks/Classification/__init__.py
index 3e80ae2181..12b0623b6b 100644
--- a/mteb/tasks/Classification/__init__.py
+++ b/mteb/tasks/Classification/__init__.py
@@ -33,10 +33,27 @@
 from .eng.NewsClassification import *
 from .eng.PatentClassification import *
 from .eng.PoemSentimentClassification import *
+from .eng.SDSEyeProtectionClassification import *
+from .eng.SDSGlovesClassification import *
 from .eng.ToxicChatClassification import *
 from .eng.ToxicConversationsClassification import *
 from .eng.TweetSentimentExtractionClassification import *
 from .eng.TweetTopicSingleClassification import *
+from .eng.WikipediaBiolumNeurochemClassification import *
+from .eng.WikipediaBioMetChemClassification import *
+from .eng.WikipediaChemEngSpecialtiesClassification import *
+from .eng.WikipediaChemFieldsClassification import *
+from .eng.WikipediaChemistryTopicsClassification import *
+from .eng.WikipediaCompChemSpectroscopyClassification import *
+from .eng.WikipediaCryobiologySeparationClassification import *
+from .eng.WikipediaCrystallographyAnalyticalClassification import *
+from .eng.WikipediaGreenhouseEnantiopureClassification import *
+from .eng.WikipediaIsotopesFissionClassification import *
+from .eng.WikipediaLuminescenceClassification import *
+from .eng.WikipediaOrganicInorganicClassification import *
+from .eng.WikipediaSaltsSemiconductorsClassification import *
+from .eng.WikipediaSolidStateColloidalClassification import *
+from .eng.WikipediaTheoreticalAppliedClassification import *
 from .eng.YahooAnswersTopicsClassification import *
 from .eng.YelpReviewFullClassification import *
 from .est.estonian_valence import *
diff --git a/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py b/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py
new file mode 100644
index 0000000000..197060ba0c
--- /dev/null
+++ b/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py
@@ -0,0 +1,46 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class SDSEyeProtectionClassification(AbsTaskClassification):
+    """ChemTEB classification task (English, test split, accuracy)."""
+
+    metadata = TaskMetadata(
+        name="SDSEyeProtectionClassification",
+        description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
+        reference="https://arxiv.org/abs/2412.00532",
+        dataset={
+            "path": "BASF-AI/SDSEyeProtectionClassification",
+            "revision": "35cbe5ee544dd26e343238a333de4568e6f77819",
+        },
+        type="Classification",
+        category="s2p",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=("2024-06-01", "2024-11-30"),
+        domains=["Chemistry"],
+        task_subtypes=[],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="LM-generated and reviewed",
+        dialect=[],
+        sample_creation="created",
+        bibtex_citation=r"""
+    @article{kasmaee2024chemteb,
+    title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
+    author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+    journal={arXiv preprint arXiv:2412.00532},
+    year={2024}
+    }
+    @inproceedings{pereira2020msds,
+    title={MSDS-OPP: Operator Procedures Prediction in Material Safety Data Sheets},
+    author={Pereira, Eliseu},
+    booktitle={15th Doctoral Symposium},
+    pages={42},
+    year={2020}
+    }
+    """,
+    )
diff --git a/mteb/tasks/Classification/eng/SDSGlovesClassification.py b/mteb/tasks/Classification/eng/SDSGlovesClassification.py
new file mode 100644
index 0000000000..ac471d58e9
--- /dev/null
+++ b/mteb/tasks/Classification/eng/SDSGlovesClassification.py
@@ -0,0 +1,46 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class SDSGlovesClassification(AbsTaskClassification):
+    """ChemTEB classification task (English, test split, accuracy)."""
+
+    metadata = TaskMetadata(
+        name="SDSGlovesClassification",
+        description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
+        reference="https://arxiv.org/abs/2412.00532",
+        dataset={
+            "path": "BASF-AI/SDSGlovesClassification",
+            "revision": "c723236c5ec417d79512e6104aca9d2cd88168f6",
+        },
+        type="Classification",
+        category="s2p",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=("2024-06-01", "2024-11-30"),
+        domains=["Chemistry"],
+        task_subtypes=[],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="LM-generated and reviewed",
+        dialect=[],
+        sample_creation="created",
+        bibtex_citation=r"""
+    @article{kasmaee2024chemteb,
+    title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
+    author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+    journal={arXiv preprint arXiv:2412.00532},
+    year={2024}
+    }
+    @inproceedings{pereira2020msds,
+    title={MSDS-OPP: Operator Procedures Prediction in Material Safety Data Sheets},
+    author={Pereira, Eliseu},
+    booktitle={15th Doctoral Symposium},
+    pages={42},
+    year={2020}
+    }
+    """,
+    )
diff --git a/mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py b/mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py
new file mode 100644
index 0000000000..3b494f46f6
--- /dev/null
+++ b/mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class WikipediaBioMetChemClassification(AbsTaskClassification):
+    """ChemTEB classification task (English, test split, accuracy)."""
+
+    metadata = TaskMetadata(
+        name="WikipediaBioMetChemClassification",
+        description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
+        reference="https://arxiv.org/abs/2412.00532",
+        dataset={
+            "path": "BASF-AI/WikipediaEasy2GeneExpressionVsMetallurgyClassification",
+            "revision": "6ac491e5de9070c6dd434b31e76d3d379123dcff",
+        },
+        type="Classification",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=("2024-06-01", "2024-11-30"),
+        domains=["Chemistry"],
+        task_subtypes=[],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="created",
+        bibtex_citation=r"""
+    @article{kasmaee2024chemteb,
+    title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
+    author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+    journal={arXiv preprint arXiv:2412.00532},
+    year={2024}
+    }
+    """,
+    )
diff --git a/mteb/tasks/Classification/eng/WikipediaBiolumNeurochemClassification.py b/mteb/tasks/Classification/eng/WikipediaBiolumNeurochemClassification.py
new file mode 100644
index 0000000000..623ec8fc66
--- /dev/null
+++ b/mteb/tasks/Classification/eng/WikipediaBiolumNeurochemClassification.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class WikipediaBiolumNeurochemClassification(AbsTaskClassification):
+    """ChemTEB classification task (English, test split, accuracy)."""
+
+    metadata = TaskMetadata(
+        name="WikipediaBiolumNeurochemClassification",
+        description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
+        reference="https://arxiv.org/abs/2412.00532",
+        dataset={
+            "path": "BASF-AI/WikipediaMedium2BioluminescenceVsNeurochemistryClassification",
+            "revision": "2f68b7d34c2be896e46b14533573b366e59e5aae",
+        },
+        type="Classification",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=("2024-06-01", "2024-11-30"),
+        domains=["Chemistry"],
+        task_subtypes=[],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="created",
+        bibtex_citation=r"""
+    @article{kasmaee2024chemteb,
+    title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
+    author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+    journal={arXiv preprint arXiv:2412.00532},
+    year={2024}
+    }
+    """,
+    )
diff --git a/mteb/tasks/Classification/eng/WikipediaChemEngSpecialtiesClassification.py b/mteb/tasks/Classification/eng/WikipediaChemEngSpecialtiesClassification.py
new file mode 100644
index 0000000000..c95abcd4f2
--- /dev/null
+++ b/mteb/tasks/Classification/eng/WikipediaChemEngSpecialtiesClassification.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class WikipediaChemEngSpecialtiesClassification(AbsTaskClassification):
+    """ChemTEB classification task (English, test split, accuracy)."""
+
+    metadata = TaskMetadata(
+        name="WikipediaChemEngSpecialtiesClassification",
+        description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
+        reference="https://arxiv.org/abs/2412.00532",
+        dataset={
+            "path": "BASF-AI/WikipediaMedium5Classification",
+            "revision": "f81a76a2fb690e5d5bd7a26dd07e85cdf8405dfb",
+        },
+        type="Classification",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=("2024-06-01", "2024-11-30"),
+        domains=["Chemistry"],
+        task_subtypes=[],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="created",
+        bibtex_citation=r"""
+    @article{kasmaee2024chemteb,
+    title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
+    author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+    journal={arXiv preprint arXiv:2412.00532},
+    year={2024}
+    }
+    """,
+    )
diff --git a/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py b/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py
new file mode 100644
index 0000000000..7c0179fb1e
--- /dev/null
+++ b/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class WikipediaChemFieldsClassification(AbsTaskClassification):
+    """ChemTEB classification task (English, test split, accuracy)."""
+
+    metadata = TaskMetadata(
+        name="WikipediaChemFieldsClassification",
+        description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
+        reference="https://arxiv.org/abs/2412.00532",
+        dataset={
+            "path": "BASF-AI/WikipediaEZ10Classification",
+            "revision": "a75fae77759acc115f015f2b856baa47776d733d",
+        },
+        type="Classification",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=("2024-06-01", "2024-11-30"),
+        domains=["Chemistry"],
+        task_subtypes=[],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="created",
+        bibtex_citation=r"""
+    @article{kasmaee2024chemteb,
+    title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
+    author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+    journal={arXiv preprint arXiv:2412.00532},
+    year={2024}
+    }
+    """,
+    )
diff --git a/mteb/tasks/Classification/eng/WikipediaChemistryTopicsClassification.py b/mteb/tasks/Classification/eng/WikipediaChemistryTopicsClassification.py
new file mode 100644
index 0000000000..02751b1a32
--- /dev/null
+++ b/mteb/tasks/Classification/eng/WikipediaChemistryTopicsClassification.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class WikipediaChemistryTopicsClassification(AbsTaskClassification):
+    """ChemTEB classification task (English, test split, accuracy)."""
+
+    metadata = TaskMetadata(
+        name="WikipediaChemistryTopicsClassification",
+        description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
+        reference="https://arxiv.org/abs/2412.00532",
+        dataset={
+            "path": "BASF-AI/WikipediaEasy10Classification",
+            "revision": "d8fb355db2248f95df8ea410a43aa1db1ee96ba4",
+        },
+        type="Classification",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=("2024-06-01", "2024-11-30"),
+        domains=["Chemistry"],
+        task_subtypes=[],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="created",
+        bibtex_citation=r"""
+    @article{kasmaee2024chemteb,
+    title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
+    author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+    journal={arXiv preprint arXiv:2412.00532},
+    year={2024}
+    }
+    """,
+    )
diff --git a/mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py b/mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py
new file mode 100644
index 0000000000..28a42ac044
--- /dev/null
+++ b/mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class WikipediaCompChemSpectroscopyClassification(AbsTaskClassification):
+    """ChemTEB classification task (English, test split, accuracy)."""
+
+    metadata = TaskMetadata(
+        name="WikipediaCompChemSpectroscopyClassification",
+        description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
+        reference="https://arxiv.org/abs/2412.00532",
+        dataset={
+            "path": "BASF-AI/WikipediaMedium2ComputationalVsSpectroscopistsClassification",
+            "revision": "474d706a22b0451b5846d623aa4b4234ba5b0513",
+        },
+        type="Classification",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=("2024-06-01", "2024-11-30"),
+        domains=["Chemistry"],
+        task_subtypes=[],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="created",
+        bibtex_citation=r"""
+    @article{kasmaee2024chemteb,
+    title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
+    author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+    journal={arXiv preprint arXiv:2412.00532},
+    year={2024}
+    }
+    """,
+    )
diff --git a/mteb/tasks/Classification/eng/WikipediaCryobiologySeparationClassification.py b/mteb/tasks/Classification/eng/WikipediaCryobiologySeparationClassification.py
new file mode 100644
index 0000000000..0e01454298
--- /dev/null
+++ b/mteb/tasks/Classification/eng/WikipediaCryobiologySeparationClassification.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class WikipediaCryobiologySeparationClassification(AbsTaskClassification):
+    """ChemTEB classification task (English, test split, accuracy)."""
+
+    metadata = TaskMetadata(
+        name="WikipediaCryobiologySeparationClassification",
+        description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
+        reference="https://arxiv.org/abs/2412.00532",
+        dataset={
+            "path": "BASF-AI/WikipediaEasy5Classification",
+            "revision": "858633e882dadd1ec6a0d220f7549bcafd379236",
+        },
+        type="Classification",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=("2024-06-01", "2024-11-30"),
+        domains=["Chemistry"],
+        task_subtypes=[],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="created",
+        bibtex_citation=r"""
+    @article{kasmaee2024chemteb,
+    title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
+    author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+    journal={arXiv preprint arXiv:2412.00532},
+    year={2024}
+    }
+    """,
+    )
diff --git a/mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py b/mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py
new file mode 100644
index 0000000000..724ffc4249
--- /dev/null
+++ b/mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class WikipediaCrystallographyAnalyticalClassification(AbsTaskClassification):
+    """ChemTEB classification task (English, test split, accuracy)."""
+
+    metadata = TaskMetadata(
+        name="WikipediaCrystallographyAnalyticalClassification",
+        description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
+        reference="https://arxiv.org/abs/2412.00532",
+        dataset={
+            "path": "BASF-AI/WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification",
+            "revision": "740565a6a853aaed1114a13bdfd5fd46857b4f11",
+        },
+        type="Classification",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=("2024-06-01", "2024-11-30"),
+        domains=["Chemistry"],
+        task_subtypes=[],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="created",
+        bibtex_citation=r"""
+    @article{kasmaee2024chemteb,
+    title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
+    author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+    journal={arXiv preprint arXiv:2412.00532},
+    year={2024}
+    }
+    """,
+    )
diff --git a/mteb/tasks/Classification/eng/WikipediaGreenhouseEnantiopureClassification.py b/mteb/tasks/Classification/eng/WikipediaGreenhouseEnantiopureClassification.py
new file mode 100644
index 0000000000..b701584a70
--- /dev/null
+++ b/mteb/tasks/Classification/eng/WikipediaGreenhouseEnantiopureClassification.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class WikipediaGreenhouseEnantiopureClassification(AbsTaskClassification):
+    """ChemTEB classification task (English, test split, accuracy)."""
+
+    metadata = TaskMetadata(
+        name="WikipediaGreenhouseEnantiopureClassification",
+        description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
+        reference="https://arxiv.org/abs/2412.00532",
+        dataset={
+            "path": "BASF-AI/WikipediaEasy2GreenhouseVsEnantiopureClassification",
+            "revision": "0cfc1a83b6ed832454e8f4f93f7a0e26208274d9",
+        },
+        type="Classification",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=("2024-06-01", "2024-11-30"),
+        domains=["Chemistry"],
+        task_subtypes=[],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="created",
+        bibtex_citation=r"""
+    @article{kasmaee2024chemteb,
+    title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
+    author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+    journal={arXiv preprint arXiv:2412.00532},
+    year={2024}
+    }
+    """,
+    )
diff --git a/mteb/tasks/Classification/eng/WikipediaIsotopesFissionClassification.py b/mteb/tasks/Classification/eng/WikipediaIsotopesFissionClassification.py
new file mode 100644
index 0000000000..252ad85ed9
--- /dev/null
+++ b/mteb/tasks/Classification/eng/WikipediaIsotopesFissionClassification.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class WikipediaIsotopesFissionClassification(AbsTaskClassification):
+    """ChemTEB classification task (English, test split, accuracy)."""
+
+    metadata = TaskMetadata(
+        name="WikipediaIsotopesFissionClassification",
+        description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
+        reference="https://arxiv.org/abs/2412.00532",
+        dataset={
+            "path": "BASF-AI/WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification",
+            "revision": "897743346c7c794264f7dbfadc3978aa2895e8e2",
+        },
+        type="Classification",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=("2024-06-01", "2024-11-30"),
+        domains=["Chemistry"],
+        task_subtypes=[],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="created",
+        bibtex_citation=r"""
+    @article{kasmaee2024chemteb,
+    title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
+    author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+    journal={arXiv preprint arXiv:2412.00532},
+    year={2024}
+    }
+    """,
+    )
diff --git a/mteb/tasks/Classification/eng/WikipediaLuminescenceClassification.py b/mteb/tasks/Classification/eng/WikipediaLuminescenceClassification.py
new file mode 100644
index 0000000000..8e115b59d4
--- /dev/null
+++ b/mteb/tasks/Classification/eng/WikipediaLuminescenceClassification.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class WikipediaLuminescenceClassification(AbsTaskClassification):
+    """ChemTEB classification task (English, test split, accuracy)."""
+
+    metadata = TaskMetadata(
+        name="WikipediaLuminescenceClassification",
+        description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
+        reference="https://arxiv.org/abs/2412.00532",
+        dataset={
+            "path": "BASF-AI/WikipediaHard2BioluminescenceVsLuminescenceClassification",
+            "revision": "21c4dcebe2c5b36a35292e6441e7a10b59bf4896",
+        },
+        type="Classification",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=("2024-06-01", "2024-11-30"),
+        domains=["Chemistry"],
+        task_subtypes=[],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="created",
+        bibtex_citation=r"""
+    @article{kasmaee2024chemteb,
+    title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
+    author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+    journal={arXiv preprint arXiv:2412.00532},
+    year={2024}
+    }
+    """,
+    )
diff --git a/mteb/tasks/Classification/eng/WikipediaOrganicInorganicClassification.py b/mteb/tasks/Classification/eng/WikipediaOrganicInorganicClassification.py
new file mode 100644
index 0000000000..0ad784b69b
--- /dev/null
+++ b/mteb/tasks/Classification/eng/WikipediaOrganicInorganicClassification.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class WikipediaOrganicInorganicClassification(AbsTaskClassification):
+    """ChemTEB classification task (English, test split, accuracy)."""
+
+    metadata = TaskMetadata(
+        name="WikipediaOrganicInorganicClassification",
+        description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
+        reference="https://arxiv.org/abs/2412.00532",
+        dataset={
+            "path": "BASF-AI/WikipediaEasy2SpecialClassification",
+            "revision": "96d1d9b37c4693f74c46c83d63a290573f78d511",
+        },
+        type="Classification",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=("2024-06-01", "2024-11-30"),
+        domains=["Chemistry"],
+        task_subtypes=[],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="created",
+        bibtex_citation=r"""
+    @article{kasmaee2024chemteb,
+    title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
+    author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+    journal={arXiv preprint arXiv:2412.00532},
+    year={2024}
+    }
+    """,
+    )
diff --git a/mteb/tasks/Classification/eng/WikipediaSaltsSemiconductorsClassification.py b/mteb/tasks/Classification/eng/WikipediaSaltsSemiconductorsClassification.py
new file mode 100644
index 0000000000..a409f87c8d
--- /dev/null
+++ b/mteb/tasks/Classification/eng/WikipediaSaltsSemiconductorsClassification.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class WikipediaSaltsSemiconductorsClassification(AbsTaskClassification):
+    """ChemTEB classification task (English, test split, accuracy)."""
+
+    metadata = TaskMetadata(
+        name="WikipediaSaltsSemiconductorsClassification",
+        description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
+        reference="https://arxiv.org/abs/2412.00532",
+        dataset={
+            "path": "BASF-AI/WikipediaHard2SaltsVsSemiconductorMaterialsClassification",
+            "revision": "9e5415a096012fa2d1f3a929952cf9859e4550e7",
+        },
+        type="Classification",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=("2024-06-01", "2024-11-30"),
+        domains=["Chemistry"],
+        task_subtypes=[],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="created",
+        bibtex_citation=r"""
+    @article{kasmaee2024chemteb,
+    title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
+    author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+    journal={arXiv preprint arXiv:2412.00532},
+    year={2024}
+    }
+    """,
+    )
diff --git a/mteb/tasks/Classification/eng/WikipediaSolidStateColloidalClassification.py b/mteb/tasks/Classification/eng/WikipediaSolidStateColloidalClassification.py
new file mode 100644
index 0000000000..43f95c50f3
--- /dev/null
+++ b/mteb/tasks/Classification/eng/WikipediaSolidStateColloidalClassification.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class WikipediaSolidStateColloidalClassification(AbsTaskClassification):
+    """ChemTEB classification task (English, test split, accuracy)."""
+
+    metadata = TaskMetadata(
+        name="WikipediaSolidStateColloidalClassification",
+        description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
+        reference="https://arxiv.org/abs/2412.00532",
+        dataset={
+            "path": "BASF-AI/WikipediaEasy2SolidStateVsColloidalClassification",
+            "revision": "7d8df44e588b6143d4856c781f72f919fa0599a7",
+        },
+        type="Classification",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=("2024-06-01", "2024-11-30"),
+        domains=["Chemistry"],
+        task_subtypes=[],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="created",
+        bibtex_citation=r"""
+    @article{kasmaee2024chemteb,
+    title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
+    author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+    journal={arXiv preprint arXiv:2412.00532},
+    year={2024}
+    }
+    """,
+    )
diff --git a/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py b/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py
new file mode 100644
index 0000000000..f33b02f4bb
--- /dev/null
+++ b/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py
@@ -0,0 +1,39 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class WikipediaTheoreticalAppliedClassification(AbsTaskClassification):
+    """ChemTEB classification task (English, test split, accuracy)."""
+
+    metadata = TaskMetadata(
+        name="WikipediaTheoreticalAppliedClassification",
+        description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
+        reference="https://arxiv.org/abs/2412.00532",
+        dataset={
+            "path": "BASF-AI/WikipediaEZ2Classification",
+            "revision": "7896906653d31d7102a143d7f55d67cd688e3147",
+        },
+        type="Classification",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="accuracy",
+        date=("2024-06-01", "2024-11-30"),
+        domains=["Chemistry"],
+        task_subtypes=[],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="created",
+        bibtex_citation=r"""
+    @article{kasmaee2024chemteb,
+    title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
+    author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+    journal={arXiv preprint arXiv:2412.00532},
+    year={2024}
+    }
+    """,
+    )
diff --git a/mteb/tasks/Clustering/__init__.py b/mteb/tasks/Clustering/__init__.py
index
014796a4cb..de27839290 100644 --- a/mteb/tasks/Clustering/__init__.py +++ b/mteb/tasks/Clustering/__init__.py @@ -18,6 +18,8 @@ from .eng.StackExchangeClusteringP2P import * from .eng.TwentyNewsgroupsClustering import * from .eng.WikiCitiesClustering import * +from .eng.WikipediaChemistrySpecialtiesClustering import * +from .eng.WikipediaChemistryTopicsClustering import * from .fra.AlloProfClusteringP2P import * from .fra.AlloProfClusteringS2S import * from .fra.HALClusteringS2S import * diff --git a/mteb/tasks/Clustering/eng/WikipediaChemistrySpecialtiesClustering.py b/mteb/tasks/Clustering/eng/WikipediaChemistrySpecialtiesClustering.py new file mode 100644 index 0000000000..a4e4082a69 --- /dev/null +++ b/mteb/tasks/Clustering/eng/WikipediaChemistrySpecialtiesClustering.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClustering import AbsTaskClustering +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaChemistrySpecialtiesClustering(AbsTaskClustering): + metadata = TaskMetadata( + name="WikipediaSpecialtiesInChemistryClustering", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaMedium5Clustering", + "revision": "7754d8d296f9f4c3af1c6426fab36304730ccddf", + }, + type="Clustering", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="v_measure", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and 
Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/Clustering/eng/WikipediaChemistryTopicsClustering.py b/mteb/tasks/Clustering/eng/WikipediaChemistryTopicsClustering.py new file mode 100644 index 0000000000..bfa5e1fcf3 --- /dev/null +++ b/mteb/tasks/Clustering/eng/WikipediaChemistryTopicsClustering.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClustering import AbsTaskClustering +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class WikipediaChemistryTopicsClustering(AbsTaskClustering): + metadata = TaskMetadata( + name="WikipediaChemistryTopicsClustering", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/WikipediaEasy10Clustering", + "revision": "0a0886b06acbfc735bca6a71b21ce1e5cb92a37b", + }, + type="Clustering", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="v_measure", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + """, + ) diff --git a/mteb/tasks/PairClassification/__init__.py b/mteb/tasks/PairClassification/__init__.py index c2057a4952..e229195df0 100644 --- a/mteb/tasks/PairClassification/__init__.py +++ b/mteb/tasks/PairClassification/__init__.py @@ -4,6 +4,10 @@ from 
.ces.CTKFactsNLI import * from .deu.FalseFriendsDeEnPC import * from .eng.LegalBenchPC import * +from .eng.PubChemAISentenceParaphrasePC import * +from .eng.PubChemSMILESPC import * +from .eng.PubChemSynonymPC import * +from .eng.PubChemWikiParagraphsPC import * from .eng.SprintDuplicateQuestionsPC import * from .eng.TwitterSemEval2015PC import * from .eng.TwitterURLCorpusPC import * @@ -13,6 +17,7 @@ from .kor.KlueNLI import * from .multilingual.OpusparcusPC import * from .multilingual.PawsXPairClassification import * +from .multilingual.PubChemWikiPairClassification import * from .multilingual.RTE3 import * from .multilingual.XNLI import * from .multilingual.XStance import * diff --git a/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py b/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py new file mode 100644 index 0000000000..f453ebee31 --- /dev/null +++ b/mteb/tasks/PairClassification/eng/PubChemAISentenceParaphrasePC.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class PubChemAISentenceParaphrasePC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="PubChemAISentenceParaphrasePC", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/PubChemAISentenceParaphrasePC", + "revision": "f33a205966ce032f957c3a22f4f9e378f89a2c56", + }, + type="PairClassification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="max_ap", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="LM-generated", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text 
Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + @article{kim2023pubchem, + title={PubChem 2023 update}, + author={Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, + journal={Nucleic acids research}, + volume={51}, + number={D1}, + pages={D1373--D1380}, + year={2023}, + publisher={Oxford University Press} + } + """, + ) + + def dataset_transform(self): + _dataset = {} + for split in self.metadata.eval_splits: + hf_dataset = self.dataset[split] + _dataset[split] = [ + { + "sentence1": hf_dataset["sent1"], + "sentence2": hf_dataset["sent2"], + "labels": hf_dataset["labels"], + } + ] + self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py b/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py new file mode 100644 index 0000000000..b3e297e043 --- /dev/null +++ b/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py @@ -0,0 +1,128 @@ +from __future__ import annotations + +import datasets + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + +_DATASET_COLUMN_MAP = [ + { + "name": "iso-desc", + "sent1": "description", + "sent2": "isomeric_smiles", + "labels": "labels", + }, + { + "name": "iso-title", + "sent1": "title", + "sent2": "isomeric_smiles", + "labels": "labels", + }, + { + "name": "canon-desc", + "sent1": "description", + "sent2": "canonical_smiles", + "labels": "labels", + }, + { + "name": "canon-title", + "sent1": "title", + "sent2": "canonical_smiles", + "labels": "labels", + }, +] + + +class PubChemSMILESPC(AbsTaskPairClassification): + 
metadata = TaskMetadata( + name="PubChemSMILESPC", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/PubChemSMILESPairClassification", + "revision": "7ba40b69f5fe6ffe4cc189aac9e1710913c73c8a", + }, + type="PairClassification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="max_ap", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + @article{kim2023pubchem, + title={PubChem 2023 update}, + author={Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, + journal={Nucleic acids research}, + volume={51}, + number={D1}, + pages={D1373--D1380}, + year={2023}, + publisher={Oxford University Press} + } + """, + ) + + def load_data(self): + """Load dataset from HuggingFace hub""" + if self.data_loaded: + return + + _hf_dataset = None + for dataset_col_map in _DATASET_COLUMN_MAP: + _dataset = datasets.load_dataset( + self.metadata.dataset["path"], + dataset_col_map["name"], + revision=self.metadata.dataset["revision"], + ) + + _dataset = _dataset.rename_columns( + { + dataset_col_map["sent1"]: "sentence1", + dataset_col_map["sent2"]: "sentence2", + dataset_col_map["labels"]: "labels", + } + ) + + if _hf_dataset is None: + 
_hf_dataset = _dataset + else: + _hf_dataset["test"] = datasets.concatenate_datasets( + [_hf_dataset["test"], _dataset["test"]] + ) + + self.dataset = _hf_dataset + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, + seed=self.seed, + splits=self.metadata.eval_splits, + label="labels", + ) + + _dataset = {} + for split in self.metadata.eval_splits: + hf_dataset = self.dataset[split] + _dataset[split] = [ + { + "sentence1": hf_dataset["sentence1"], + "sentence2": hf_dataset["sentence2"], + "labels": hf_dataset["labels"], + } + ] + self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py b/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py new file mode 100644 index 0000000000..6b6dfd81c8 --- /dev/null +++ b/mteb/tasks/PairClassification/eng/PubChemSynonymPC.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class PubChemSynonymPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="PubChemSynonymPC", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/PubChemSynonymPC", + "revision": "5037d69d177c9628fb79cb57eea1299178b28c1b", + }, + type="PairClassification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="max_ap", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + 
author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + @article{kim2023pubchem, + title={PubChem 2023 update}, + author={Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, + journal={Nucleic acids research}, + volume={51}, + number={D1}, + pages={D1373--D1380}, + year={2023}, + publisher={Oxford University Press} + } + """, + ) + + def dataset_transform(self): + _dataset = {} + + for split in self.metadata.eval_splits: + hf_dataset = self.dataset[split] + _dataset[split] = [ + { + "sentence1": hf_dataset["title"], + "sentence2": hf_dataset["synonyms"], + "labels": hf_dataset["labels"], + } + ] + self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py b/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py new file mode 100644 index 0000000000..679580f28c --- /dev/null +++ b/mteb/tasks/PairClassification/eng/PubChemWikiParagraphsPC.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class PubChemWikiParagraphsPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="PubChemWikiParagraphsPC", + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + dataset={ + "path": "BASF-AI/PubChemWikiParagraphsPC", + "revision": "7fb14716e4106b72f51a16e682e5cd2d67e9bd70", + }, + type="PairClassification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="max_ap", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + 
task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + @article{kim2023pubchem, + title={PubChem 2023 update}, + author={Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, + journal={Nucleic acids research}, + volume={51}, + number={D1}, + pages={D1373--D1380}, + year={2023}, + publisher={Oxford University Press} + } + """, + ) + + def dataset_transform(self): + _dataset = {} + for split in self.metadata.eval_splits: + hf_dataset = self.dataset[split] + _dataset[split] = [ + { + "sentence1": hf_dataset["sent1"], + "sentence2": hf_dataset["sent2"], + "labels": hf_dataset["labels"], + } + ] + self.dataset = _dataset diff --git a/mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py b/mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py new file mode 100644 index 0000000000..59a0605a82 --- /dev/null +++ b/mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata + +_LANGUAGES = { + "de": ["deu-Latn", "eng-Latn"], + "nl": ["nld-Latn", "eng-Latn"], + "zh": ["zho-Hans", "eng-Latn"], + "fr": ["fra-Latn", "eng-Latn"], + "es": ["spa-Latn", "eng-Latn"], + "pt": 
["por-Latn", "eng-Latn"], + "ms": ["msa-Latn", "eng-Latn"], + "ko": ["kor-Hang", "eng-Latn"], + "tr": ["tur-Latn", "eng-Latn"], + "hi": ["hin-Deva", "eng-Latn"], + "cs": ["ces-Latn", "eng-Latn"], + "ja": ["jpn-Jpan", "eng-Latn"], +} + + +class PubChemWikiPairClassification(AbsTaskPairClassification, MultilingualTask): + metadata = TaskMetadata( + name="PubChemWikiPairClassification", + dataset={ + "path": "BASF-AI/PubChemWikiMultilingualPC", + "revision": "3412b208896a37e4ebb5ff7b96f6cc313ee9d2e3", + }, + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + category="s2s", + modalities=["text"], + type="PairClassification", + eval_splits=["test"], + eval_langs=_LANGUAGES, + main_score="max_ap", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + @article{kim2023pubchem, + title={PubChem 2023 update}, + author={Kim, Sunghwan and Chen, Jie and Cheng, Tiejun and Gindulyte, Asta and He, Jia and He, Siqian and Li, Qingliang and Shoemaker, Benjamin A and Thiessen, Paul A and Yu, Bo and others}, + journal={Nucleic acids research}, + volume={51}, + number={D1}, + pages={D1373--D1380}, + year={2023}, + publisher={Oxford University Press} + } + """, + ) + + def dataset_transform(self) -> None: + _dataset = {} + for lang in self.hf_subsets: + _dataset[lang] = {} + hf_dataset = self.dataset[lang][self.metadata.eval_splits[0]] + 
_dataset[lang]["test"] = [ + { + "sentence1": hf_dataset["sent1"], + "sentence2": hf_dataset["sent2"], + "labels": hf_dataset["labels"], + } + ] + self.dataset = _dataset diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index d83df7ec5e..6c146379ea 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -29,6 +29,8 @@ from .eng.ARCChallengeRetrieval import * from .eng.ArguAnaRetrieval import * from .eng.BrightRetrieval import * +from .eng.ChemHotpotQARetrieval import * +from .eng.ChemNQRetrieval import * from .eng.ClimateFEVERRetrieval import * from .eng.CQADupstackAndroidRetrieval import * from .eng.CQADupstackEnglishRetrieval import * diff --git a/mteb/tasks/Retrieval/eng/ChemHotpotQARetrieval.py b/mteb/tasks/Retrieval/eng/ChemHotpotQARetrieval.py new file mode 100644 index 0000000000..88fbc50df4 --- /dev/null +++ b/mteb/tasks/Retrieval/eng/ChemHotpotQARetrieval.py @@ -0,0 +1,60 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class ChemHotpotQARetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="ChemHotpotQARetrieval", + dataset={ + "path": "BASF-AI/ChemHotpotQARetrieval", + "revision": "1840e8a5ac6ec752bbdd97d543ead0189bc7c25b", + }, + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["train", "dev", "test"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on 
a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + @inproceedings{yang-etal-2018-hotpotqa, + title = "{H}otpot{QA}: A Dataset for Diverse, Explainable Multi-hop Question Answering", + author = "Yang, Zhilin and + Qi, Peng and + Zhang, Saizheng and + Bengio, Yoshua and + Cohen, William and + Salakhutdinov, Ruslan and + Manning, Christopher D.", + editor = "Riloff, Ellen and + Chiang, David and + Hockenmaier, Julia and + Tsujii, Jun{'}ichi", + booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing", + month = oct # "-" # nov, + year = "2018", + address = "Brussels, Belgium", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/D18-1259", + doi = "10.18653/v1/D18-1259", + pages = "2369--2380", + abstract = "Existing question answering (QA) datasets fail to train QA systems to perform complex reasoning and provide explanations for answers. We introduce HotpotQA, a new dataset with 113k Wikipedia-based question-answer pairs with four key features: (1) the questions require finding and reasoning over multiple supporting documents to answer; (2) the questions are diverse and not constrained to any pre-existing knowledge bases or knowledge schemas; (3) we provide sentence-level supporting facts required for reasoning, allowing QA systems to reason with strong supervision and explain the predictions; (4) we offer a new type of factoid comparison questions to test QA systems{'} ability to extract relevant facts and perform necessary comparison. 
We show that HotpotQA is challenging for the latest QA systems, and the supporting facts enable models to improve performance and make explainable predictions.", + } +""", + ) diff --git a/mteb/tasks/Retrieval/eng/ChemNQRetrieval.py b/mteb/tasks/Retrieval/eng/ChemNQRetrieval.py new file mode 100644 index 0000000000..1e77971331 --- /dev/null +++ b/mteb/tasks/Retrieval/eng/ChemNQRetrieval.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class ChemNQRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="ChemNQRetrieval", + dataset={ + "path": "BASF-AI/ChemNQRetrieval", + "revision": "5d958fb6b31055495347724d46431ba41309b03a", + }, + description="ChemTEB evaluates the performance of text embedding models on chemical domain data.", + reference="https://arxiv.org/abs/2412.00532", + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2024-06-01", "2024-11-30"), + domains=["Chemistry"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" + @article{kasmaee2024chemteb, + title={ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain}, + author={Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila}, + journal={arXiv preprint arXiv:2412.00532}, + year={2024} + } + @article{47761, + title = {Natural Questions: a Benchmark for Question Answering Research}, + author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh + and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee + and Kristina N. 
Toutanova and Llion Jones and Ming-Wei Chang and Andrew Dai and Jakob Uszkoreit and Quoc Le + and Slav Petrov}, + year = {2019}, + journal = {Transactions of the Association of Computational Linguistics}} + """, + ) From 223bf324c213f222785bbf2db88e30c8069c610b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 25 Jan 2025 03:53:39 +0000 Subject: [PATCH 002/275] Update points table --- docs/mmteb/points_table.md | 200 ++++++++++++++++++------------------- 1 file changed, 100 insertions(+), 100 deletions(-) diff --git a/docs/mmteb/points_table.md b/docs/mmteb/points_table.md index cd166890d7..dfb4a6b31c 100644 --- a/docs/mmteb/points_table.md +++ b/docs/mmteb/points_table.md @@ -2,103 +2,103 @@ _Note_: this table is **autogenerated** and should not be edited. It is intended to get an overview of contributions. - | GitHub | New dataset | Review PR | Bug fixes | Coordination | Paper writing | Dataset annotations | Running Models | New task | Total | -|:------------------|--------------:|------------:|------------:|---------------:|----------------:|----------------------:|-----------------:|-----------:|--------:| -| KennethEnevoldsen | 68 | 326 | 87 | 81 | 0 | 35 | 0 | 0 | 597 | -| isaac-chung | 120 | 194 | 50 | 54 | 12 | 1 | 0 | 2 | 433 | -| imenelydiaker | 120 | 144 | 24 | 70 | 0 | 0 | 0 | 0 | 358 | -| awinml | 300 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 302 | -| x-tabdeveloping | 144 | 32 | 10 | 41 | 0 | 0 | 0 | 12 | 239 | -| davidstap | 176 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 176 | -| jaygala24 | 149 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 149 | -| wissam-sib | 134 | 6 | 4 | 0 | 0 | 0 | 0 | 0 | 144 | -| Muennighoff | 0 | 48 | 0 | 70 | 0 | 0 | 24 | 0 | 142 | -| orionw | 0 | 20 | 20 | 75 | 0 | 0 | 0 | 10 | 125 | -| dokato | 94 | 6 | 12 | 0 | 0 | 0 | 0 | 0 | 112 | -| gentaiscool | 110 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 110 | -| jupyterjazz | 108 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 108 | -| SaitejaUtpala | 102 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 102 | -| vaibhavad | 6 | 4 | 8 | 75 | 0 | 0 | 0 | 0 
| 93 | -| schmarion | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | -| MathieuCiancone | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | -| GabrielSequeira | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | -| digantamisra98 | 71 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 71 | -| shreeya-dhakal | 54 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 62 | -| Rysias | 58 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 58 | -| Samoed | 18 | 2 | 22 | 0 | 0 | 0 | 9 | 0 | 51 | -| sivareddyg | 0 | 0 | 0 | 50 | 0 | 0 | 0 | 0 | 50 | -| gowitheflow-1998 | 50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 50 | -| asparius | 34 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 48 | -| Akash190104 | 46 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 46 | -| MartinBernstorff | 2 | 8 | 13 | 20 | 0 | 0 | 0 | 0 | 43 | -| akshita-sukhlecha | 36 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 40 | -| staoxiao | 40 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 40 | -| bp-high | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 36 | -| rafalposwiata | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 36 | -| KranthiGV | 20 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 34 | -| loicmagne | 0 | 0 | 28 | 0 | 0 | 0 | 0 | 0 | 28 | -| ShawonAshraf | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | -| bjoernpl | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | -| jphme | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | -| rasdani | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | -| violenil | 26 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 26 | -| mariyahendriksen | 0 | 0 | 0 | 0 | 24 | 0 | 0 | 0 | 24 | -| dwzhu-pku | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 24 | -| hgissbkh | 0 | 2 | 13 | 0 | 3 | 0 | 0 | 5 | 23 | -| taeminlee | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | -| kwojtasi | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | -| jankounchained | 14 | 0 | 8 | 0 | 0 | 0 | 0 | 0 | 22 | -| tomaarsen | 0 | 2 | 0 | 20 | 0 | 0 | 0 | 0 | 22 | -| crystina-z | 21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | -| mrshu | 16 | 4 | 0 | 0 | 0 | 1 | 0 | 0 | 21 | -| john-b-yang | 0 | 0 | 0 | 0 | 20 | 0 | 0 | 0 | 20 | -| rbroc | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | -| mmhamdy | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | -| ManuelFay | 2 | 0 | 13 | 0 | 0 | 0 | 0 | 5 | 20 | -| AlexeyVatolin | 0 | 0 | 20 | 0 | 0 | 0 
| 0 | 0 | 20 | -| Andrian0s | 14 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 20 | -| thakur-nandan | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | -| manandey | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | -| PranjalChitale | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | -| dipam7 | 14 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | -| sted97 | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | -| Sakshamrzt | 12 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | -| taidnguyen | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | -| artemsnegirev | 12 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 14 | -| slvnwhrl | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | -| anpalmak2003 | 9 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 12 | -| Art3mis07 | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | -| guenthermi | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | -| jordiclive | 2 | 0 | 10 | 0 | 0 | 0 | 0 | 0 | 12 | -| xhluca | 6 | 2 | 4 | 0 | 0 | 0 | 0 | 0 | 12 | -| henilp105 | 0 | 0 | 2 | 0 | 0 | 9 | 0 | 0 | 11 | -| MariyaTikhonova | 7 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 11 | -| ab1992ao | 8 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 11 | -| tmp_handle | 0 | 0 | 0 | 10 | 0 | 0 | 0 | 0 | 10 | -| swj0419 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| Ruqyai | 2 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| ZhengLiu101 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| Alenush | 6 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 10 | -| ABorghini | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| simon-clematide | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| sarahooker | 0 | 0 | 0 | 0 | 10 | 0 | 0 | 0 | 10 | -| guangyusong | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| HLasse | 0 | 0 | 5 | 0 | 0 | 5 | 0 | 0 | 10 | -| cassanof | 8 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 10 | -| hongjin-su | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| xiamengzhou | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| xu3kev | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| howard-yen | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| malteos | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| ljvmiranda921 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| marcobellagente93 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| izhx | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| 
MexicanLemonade | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| antoniolanza1996 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 2 | -| achibb | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| NouamaneTazi | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| PhilipMay | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| cslizc | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| bakrianoo | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| hanhainebula | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| monikernemo | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | \ No newline at end of file + | GitHub | New dataset | Review PR | Running Models | Bug fixes | Coordination | Dataset annotations | Paper writing | New task | Total | +|:------------------|--------------:|------------:|-----------------:|------------:|---------------:|----------------------:|----------------:|-----------:|--------:| +| KennethEnevoldsen | 68 | 326 | 0 | 87 | 81 | 35 | 0 | 0 | 597 | +| isaac-chung | 120 | 194 | 0 | 50 | 54 | 1 | 12 | 2 | 433 | +| imenelydiaker | 120 | 144 | 0 | 24 | 70 | 0 | 0 | 0 | 358 | +| awinml | 300 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 302 | +| x-tabdeveloping | 144 | 32 | 0 | 10 | 41 | 0 | 0 | 12 | 239 | +| davidstap | 176 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 176 | +| jaygala24 | 149 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 149 | +| wissam-sib | 134 | 6 | 0 | 4 | 0 | 0 | 0 | 0 | 144 | +| Muennighoff | 0 | 48 | 24 | 0 | 70 | 0 | 0 | 0 | 142 | +| orionw | 0 | 20 | 0 | 20 | 75 | 0 | 0 | 10 | 125 | +| dokato | 94 | 6 | 0 | 12 | 0 | 0 | 0 | 0 | 112 | +| gentaiscool | 110 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 110 | +| jupyterjazz | 108 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 108 | +| SaitejaUtpala | 102 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 102 | +| vaibhavad | 6 | 4 | 0 | 8 | 75 | 0 | 0 | 0 | 93 | +| schmarion | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | +| MathieuCiancone | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | +| GabrielSequeira | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 88 | +| digantamisra98 | 71 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 71 | +| shreeya-dhakal | 54 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 62 | +| Rysias | 58 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 58 | +| Samoed | 18 | 2 | 9 | 22 | 0 | 0 | 0 | 0 | 51 | +| sivareddyg | 0 | 0 | 0 | 0 | 50 | 0 | 0 | 0 | 50 | +| gowitheflow-1998 | 50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 50 | +| asparius | 34 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 48 | +| Akash190104 | 46 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 46 | +| MartinBernstorff | 2 | 8 | 0 | 13 | 20 | 0 | 0 | 0 | 43 | +| akshita-sukhlecha | 36 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 40 | +| staoxiao | 40 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 40 | +| bp-high | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 36 | +| rafalposwiata | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 36 | +| KranthiGV | 20 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 34 | +| loicmagne | 0 | 0 | 0 | 28 | 0 | 0 | 0 | 0 | 28 | +| ShawonAshraf | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | +| bjoernpl | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | +| jphme | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | +| rasdani | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | +| violenil | 26 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 26 | +| mariyahendriksen | 0 | 0 | 0 | 0 | 0 | 0 | 24 | 0 | 24 | +| dwzhu-pku | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 24 | +| hgissbkh | 0 | 2 | 0 | 13 | 0 | 0 | 3 | 5 | 23 | +| taeminlee | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | +| kwojtasi | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 22 | +| jankounchained | 14 | 0 | 0 | 8 | 0 | 0 | 0 | 0 | 22 | +| tomaarsen | 0 | 2 | 0 | 0 | 20 | 0 | 0 | 0 | 22 | +| crystina-z | 21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | +| mrshu | 16 | 4 | 0 | 0 | 0 | 1 | 0 | 0 | 21 | +| john-b-yang | 0 | 0 | 0 | 0 | 0 | 0 | 20 | 0 | 20 | +| rbroc | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | +| mmhamdy | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 20 | +| ManuelFay | 2 | 0 | 0 | 13 | 0 | 0 | 0 | 5 | 20 | +| AlexeyVatolin | 0 | 0 | 0 | 20 | 0 | 0 | 0 | 0 | 20 | +| Andrian0s | 14 | 4 | 0 | 2 | 0 | 0 | 0 | 0 | 20 | +| thakur-nandan | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | +| manandey | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 18 | +| PranjalChitale | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | +| dipam7 | 14 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | +| sted97 | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | 
+| Sakshamrzt | 12 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | +| taidnguyen | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 14 | +| artemsnegirev | 12 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 14 | +| slvnwhrl | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | +| anpalmak2003 | 9 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 12 | +| Art3mis07 | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | +| guenthermi | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 12 | +| jordiclive | 2 | 0 | 0 | 10 | 0 | 0 | 0 | 0 | 12 | +| xhluca | 6 | 2 | 0 | 4 | 0 | 0 | 0 | 0 | 12 | +| henilp105 | 0 | 0 | 0 | 2 | 0 | 9 | 0 | 0 | 11 | +| MariyaTikhonova | 7 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 11 | +| ab1992ao | 8 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 11 | +| tmp_handle | 0 | 0 | 0 | 0 | 10 | 0 | 0 | 0 | 10 | +| swj0419 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| Ruqyai | 2 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| ZhengLiu101 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| Alenush | 6 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 10 | +| ABorghini | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| simon-clematide | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| sarahooker | 0 | 0 | 0 | 0 | 0 | 0 | 10 | 0 | 10 | +| guangyusong | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| HLasse | 0 | 0 | 0 | 5 | 0 | 5 | 0 | 0 | 10 | +| cassanof | 8 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 10 | +| hongjin-su | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| xiamengzhou | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| xu3kev | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| howard-yen | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| malteos | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| ljvmiranda921 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| marcobellagente93 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| izhx | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| MexicanLemonade | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| antoniolanza1996 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 2 | +| achibb | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| NouamaneTazi | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| PhilipMay | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| cslizc | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| bakrianoo | 2 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 2 | +| hanhainebula | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| monikernemo | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | \ No newline at end of file From dfba463a945314ef8b2f9132b3a547b81d04dd5d Mon Sep 17 00:00:00 2001 From: github-actions Date: Sat, 25 Jan 2025 04:05:16 +0000 Subject: [PATCH 003/275] 1.30.0 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 70ad64bd07..2883e6596a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.29.16" +version = "1.30.0" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From ee0f15ad03313d3a030c6f21ae6aafd9bc95bbb0 Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Sat, 25 Jan 2025 19:41:41 +0300 Subject: [PATCH 004/275] feat: add instruct wrapper (#1768) * add instruct wrapper * use get_task_instruction * add logging messages * apply based on PromptType * update description * change example model * move nvembed * Update mteb/models/instruct_wrapper.py Co-authored-by: Isaac Chung * update docstrings * add instruction to docs * Apply suggestions from code review Co-authored-by: Isaac Chung * lint --------- Co-authored-by: Isaac Chung --- docs/adding_a_model.md | 15 ++++++ mteb/models/instruct_wrapper.py | 85 +++++++++++++++++++++++++++++++++ mteb/models/nvidia_models.py | 75 +++++++---------------------- 3 files changed, 116 insertions(+), 59 deletions(-) diff --git a/docs/adding_a_model.md b/docs/adding_a_model.md index 0e22e1ea3a..314c6e9c39 100644 --- a/docs/adding_a_model.md +++ b/docs/adding_a_model.md @@ -71,3 +71,18 @@ The leaderboard [automatically refreshes daily](https://github.com/embeddings-be ###### Instantiating the Model with Prompts If you are unable to directly add the prompts in the model configuration, you can instantiate the model using the 
`sentence_transformers_loader` and pass `prompts` as an argument. For more details, see the `mteb/models/bge_models.py` file. + +##### Adding instruction models + +Models that use instructions can use the [`InstructSentenceTransformerWrapper`](../mteb/models/instruct_wrapper.py). For example: +```python +model = ModelMeta( + loader=partial( + InstructSentenceTransformerWrapper, + model="nvidia/NV-Embed-v1", + revision="7604d305b621f14095a1aa23d351674c2859553a", + instruction_template="Instruct: {instruction}\nQuery: ", + ), + ... +) +``` \ No newline at end of file diff --git a/mteb/models/instruct_wrapper.py b/mteb/models/instruct_wrapper.py index 2ee3a09b56..cc6e814629 100644 --- a/mteb/models/instruct_wrapper.py +++ b/mteb/models/instruct_wrapper.py @@ -6,6 +6,7 @@ import numpy as np import torch +from sentence_transformers import SentenceTransformer from mteb.encoder_interface import PromptType @@ -78,3 +79,87 @@ def encode( return embeddings return InstructWrapper(model_name_or_path, mode, instruction_template, **kwargs) + + +class InstructSentenceTransformerWrapper(Wrapper): + def __init__( + self, + model_name: str, + revision: str, + instruction_template: str | Callable[[str], str] | None = None, + max_seq_length: int | None = None, + apply_instruction_to_passages: bool = True, + padding_side: str | None = None, + add_eos_token: bool = False, + **kwargs: Any, + ): + """Instruct Sentence Transformer Wrapper. Wrapper that passes instructions to the Sentence Transformer model. + Applied for models like NV-Embed, gte-Qwen, e5-mistral, etc. + + Arguments: + model_name: Model name of the sentence transformers model. + revision: Revision of the sentence transformers model. + instruction_template: Model template. Should contain the string '{instruction}'. + max_seq_length: Maximum sequence length. If None, the maximum sequence length will be read from the model config. + apply_instruction_to_passages: Whether to apply the instruction template to the passages. 
+ padding_side: Padding side. If None, the padding side will be read from the model config. + add_eos_token: Whether to add the eos token to each input example. + **kwargs: Kwargs for Sentence Transformer model. + """ + if ( + isinstance(instruction_template, str) + and "{instruction}" not in instruction_template + ): + raise ValueError( + "Instruction template must contain the string '{instruction}'." + ) + if instruction_template is None: + logger.warning( + "No instruction template provided. Instructions will be used as-is." + ) + + self.model_name = model_name + self.model = SentenceTransformer(model_name, revision=revision, **kwargs) + self.instruction_template = instruction_template + self.apply_instruction_to_passages = apply_instruction_to_passages + self.add_eos_token = add_eos_token + if max_seq_length is not None: + self.model.max_seq_length = max_seq_length + if padding_side is not None: + self.model.tokenizer.padding_side = padding_side + + def encode( + self, + sentences: Sequence[str], + *, + task_name: str, + prompt_type: PromptType | None = None, + **kwargs: Any, + ) -> np.ndarray: + if self.add_eos_token: + sentences = [ + example + self.model.tokenizer.eos_token for example in sentences + ] + + instruction = self.get_task_instruction(task_name, prompt_type) + + # to passage prompts won't be applied to passages + if not self.apply_instruction_to_passages and prompt_type == PromptType.passage: + instruction = None + logger.info( + f"No instruction used, because prompt type = {prompt_type.passage}" + ) + + if instruction: + logger.info(f"Using instruction: '{instruction}' for task: '{task_name}'") + + embeddings = self.model.encode( + sentences, + prompt=instruction, + **kwargs, + ) + + if isinstance(embeddings, torch.Tensor): + # sometimes in kwargs can be return_tensors=True + embeddings = embeddings.cpu().detach().float().numpy() + return embeddings diff --git a/mteb/models/nvidia_models.py b/mteb/models/nvidia_models.py index 
1997a85274..f3b313356a 100644 --- a/mteb/models/nvidia_models.py +++ b/mteb/models/nvidia_models.py @@ -1,17 +1,11 @@ from __future__ import annotations import logging -from collections.abc import Sequence from functools import partial -from typing import Any - -import numpy as np -import torch -from sentence_transformers import CrossEncoder, SentenceTransformer from mteb.encoder_interface import PromptType from mteb.model_meta import ModelMeta -from mteb.models.sentence_transformer_wrapper import SentenceTransformerWrapper +from mteb.models.instruct_wrapper import InstructSentenceTransformerWrapper logger = logging.getLogger(__name__) @@ -22,56 +16,6 @@ def instruction_template( return f"Instruct: {instruction}\nQuery: " if instruction else "" -class NvEmbedWrapper(SentenceTransformerWrapper): - def __init__( - self, - model: str | SentenceTransformer | CrossEncoder, - revision: str | None = None, - model_prompts: dict[str, str] | None = None, - **kwargs, - ) -> None: - super().__init__(model, revision, model_prompts, **kwargs) - self.model.max_seq_length = 32768 - self.model.tokenizer.padding_side = "right" - logger.warning( - "Instructions are used in both query and docs, which may cause performance discrepancies from the original implementation." 
- ) - - def encode( - self, - sentences: Sequence[str], - *, - task_name: str, - prompt_type: PromptType | None = None, - **kwargs: Any, - ) -> np.ndarray: - # Add eos token to each input example - sentences = [example + self.model.tokenizer.eos_token for example in sentences] - - instruction = "" - if prompt_type == PromptType.query: - instruction = self.get_instruction(task_name, prompt_type) - - prompt = instruction_template(instruction) - - if prompt: - logger.info(f"Using {prompt=} for task={task_name} {prompt_type=}") - else: - logger.info(f"No model prompts found for task={task_name} {prompt_type=}") - - logger.info(f"Encoding {len(sentences)} sentences.") - - embeddings = self.model.encode( - sentences, - prompt=prompt, - normalize_embeddings=True, - **kwargs, - ) - if isinstance(embeddings, torch.Tensor): - embeddings = embeddings.cpu().detach().float().numpy() - return embeddings - - nvidia_training_datasets = { # source: https://arxiv.org/pdf/2405.17428 "ArguAna": ["train"], @@ -120,11 +64,18 @@ def encode( "STSBenchmark": ["train"], "STSBenchmarkMultilingualSTS": ["train"], # translated, not trained on } + NV_embed_v2 = ModelMeta( loader=partial( # type: ignore - NvEmbedWrapper, + InstructSentenceTransformerWrapper, model="nvidia/NV-Embed-v2", + revision="7604d305b621f14095a1aa23d351674c2859553a", + instruction_template=instruction_template, trust_remote_code=True, + max_seq_length=32768, + padding_side="right", + # for nv-embed, we add eos token to each input example + add_eos_token=True, ), name="nvidia/NV-Embed-v2", languages=["eng_Latn"], @@ -146,9 +97,15 @@ def encode( NV_embed_v1 = ModelMeta( loader=partial( # type: ignore - NvEmbedWrapper, + InstructSentenceTransformerWrapper, model="nvidia/NV-Embed-v1", + revision="7604d305b621f14095a1aa23d351674c2859553a", + instruction_template=instruction_template, trust_remote_code=True, + max_seq_length=32768, + padding_side="right", + # for nv-embed, we add eos token to each input example + 
add_eos_token=True, ), name="nvidia/NV-Embed-v1", languages=["eng_Latn"], From 3598b3746ed69af75afbfe2b03f0081d8b14923a Mon Sep 17 00:00:00 2001 From: github-actions Date: Sat, 25 Jan 2025 16:50:17 +0000 Subject: [PATCH 005/275] 1.31.0 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2883e6596a..e989421883 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.30.0" +version = "1.31.0" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 95714d06da6a06969e92526262d57f718e840c21 Mon Sep 17 00:00:00 2001 From: Sam <40773225+sam-hey@users.noreply.github.com> Date: Sun, 26 Jan 2025 08:03:43 +0100 Subject: [PATCH 006/275] doc: update pr template (#1871) * doc: update pr template * group testing & add: do not delete --------- Co-authored-by: sam021313 <40773225+sam021313@users.noreply.github.com> --- .github/pull_request_template.md | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index be1d1c7418..71ed2554f9 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -4,11 +4,18 @@ -## Checklist +### Code Quality +- [ ] **Code Formatted**: Format the code using `make lint` to maintain consistent style. -- [ ] Run tests locally to make sure nothing is broken using `make test`. -- [ ] Run the formatter to format the code using `make lint`. +### Documentation + +- [ ] **Updated Documentation**: Add or update documentation to reflect the changes introduced in this PR. + +### Testing + +- [ ] **New Tests Added**: Write tests to cover new functionality. Validate with `make test-with-coverage`. 
+- [ ] **Tests Passed**: Run tests locally using `make test` or `make test-with-coverage` to ensure no existing functionality is broken. ### Adding datasets checklist From 1d660892288d02379e67a59b94523410497ee20b Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Sun, 26 Jan 2025 10:21:58 +0300 Subject: [PATCH 007/275] fix: fix jina v1, 2 models (#1872) fix jina models --- mteb/models/gme_models.py | 1 - mteb/models/jina_models.py | 24 ++++++++++++++++++++++++ mteb/models/lens_models.py | 8 +------- 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/mteb/models/gme_models.py b/mteb/models/gme_models.py index 53476403bc..804dfbc84d 100644 --- a/mteb/models/gme_models.py +++ b/mteb/models/gme_models.py @@ -1,7 +1,6 @@ from __future__ import annotations import logging -from functools import partial from mteb.model_meta import ModelMeta diff --git a/mteb/models/jina_models.py b/mteb/models/jina_models.py index e855ad3c7a..00641e9c89 100644 --- a/mteb/models/jina_models.py +++ b/mteb/models/jina_models.py @@ -245,6 +245,12 @@ def encode( jina_embeddings_v2_base_en = ModelMeta( + loader=partial( + SentenceTransformerWrapper, + model_name="jinaai/jina-embeddings-v2-base-en", + revision="6e85f575bc273f1fd840a658067d0157933c83f0", + trust_remote_code=True, + ), name="jinaai/jina-embeddings-v2-base-en", languages=["eng-Latn"], open_weights=True, @@ -266,6 +272,12 @@ def encode( ) jina_embeddings_v2_small_en = ModelMeta( + loader=partial( + SentenceTransformerWrapper, + model_name="jinaai/jina-embeddings-v2-small-en", + revision="796cff318cdd4e5fbe8b7303a1ef8cbec36996ef", + trust_remote_code=True, + ), name="jinaai/jina-embeddings-v2-small-en", languages=["eng-Latn"], open_weights=True, @@ -287,6 +299,12 @@ def encode( ) jina_embedding_b_en_v1 = ModelMeta( + loader=partial( + SentenceTransformerWrapper, + model_name="jinaai/jina-embedding-b-en-v1", + revision="aa0645035294a8c0607ce5bb700aba982cdff32c", + trust_remote_code=True, + ), 
name="jinaai/jina-embedding-b-en-v1", languages=["eng-Latn"], open_weights=True, @@ -308,6 +326,12 @@ def encode( ) jina_embedding_s_en_v1 = ModelMeta( + loader=partial( + SentenceTransformerWrapper, + model_name="jinaai/jina-embedding-s-en-v1", + revision="c1fed70aa4823a640f1a7150a276e4d3b08dce08", + trust_remote_code=True, + ), name="jinaai/jina-embedding-s-en-v1", languages=["eng-Latn"], open_weights=True, diff --git a/mteb/models/lens_models.py b/mteb/models/lens_models.py index 2fe54b26a1..380724e53e 100644 --- a/mteb/models/lens_models.py +++ b/mteb/models/lens_models.py @@ -1,12 +1,6 @@ from __future__ import annotations -from functools import partial - -import torch - -from mteb.encoder_interface import PromptType -from mteb.model_meta import ModelMeta, sentence_transformers_loader -from mteb.models.instruct_wrapper import instruct_wrapper +from mteb.model_meta import ModelMeta from .bge_models import bge_full_data From 7e7571e0956fe75f19572560771261044b5faf59 Mon Sep 17 00:00:00 2001 From: github-actions Date: Sun, 26 Jan 2025 07:36:48 +0000 Subject: [PATCH 008/275] 1.31.1 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e989421883..f4a0111cd5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.31.0" +version = "1.31.1" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From e6237714a1e340a0e07a8f121030e0277a8d5634 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Tue, 28 Jan 2025 11:06:02 +0100 Subject: [PATCH 009/275] fix: update voyage exp metadata (#1888) * fix: update voyage exp metadata * aded number of parameters --- mteb/models/voyage_models.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mteb/models/voyage_models.py b/mteb/models/voyage_models.py index 
a637dee36a..3bcfb997bf 100644 --- a/mteb/models/voyage_models.py +++ b/mteb/models/voyage_models.py @@ -361,17 +361,17 @@ def _batched_encode( voyage_3_exp = ModelMeta( name="voyageai/voyage-3-m-exp", revision="1", - release_date=None, # not released - languages=None, # supported languages not specified + release_date="2025-01-08", + languages=["eng-Latn"], loader=partial( VoyageWrapper, model_name="voyage-3-m-exp", model_prompts=model_prompts, ), max_tokens=32000, - embed_dim=512, + embed_dim=2048, open_weights=False, - n_parameters=None, + n_parameters=int(6918 * 1e6), license=None, reference="https://huggingface.co/voyageai/voyage-3-m-exp", similarity_fn_name="cosine", From 21c7390e8f0998b21d2793d9d7296b91136c63da Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 28 Jan 2025 10:11:28 +0000 Subject: [PATCH 010/275] 1.31.2 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f4a0111cd5..111e90a28a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.31.1" +version = "1.31.2" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 2a4173046a4b64c38c99132417abe60590dc0381 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Tue, 28 Jan 2025 16:16:42 +0100 Subject: [PATCH 011/275] fix: External results are preferred when only they have the needed splits (#1893) join_revisions now prefers task_results where the scores are not empty --- mteb/leaderboard/app.py | 12 ++++++------ mteb/load_results/benchmark_results.py | 3 +++ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 5ee5a6b9da..31ff87adef 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -20,15 +20,15 @@ logger = logging.getLogger(__name__) +ALL_MODELS = {meta.name 
for meta in mteb.get_model_metas()} + def load_results(): results_cache_path = Path(__file__).parent.joinpath("__cached_results.json") if not results_cache_path.exists(): - all_results = ( - mteb.load_results(only_main_score=True, require_model_meta=False) - .join_revisions() - .filter_models() - ) + all_results = mteb.load_results( + only_main_score=True, require_model_meta=False, models=ALL_MODELS + ).filter_models() all_results.to_disk(results_cache_path) return all_results else: @@ -168,7 +168,7 @@ def filter_models( benchmarks = mteb.get_benchmarks() all_benchmark_results = { - benchmark.name: benchmark.load_results(base_results=all_results) + benchmark.name: benchmark.load_results(base_results=all_results).join_revisions() for benchmark in benchmarks } default_benchmark = mteb.get_benchmark(DEFAULT_BENCHMARK_NAME) diff --git a/mteb/load_results/benchmark_results.py b/mteb/load_results/benchmark_results.py index e1632a3dec..c2c3b76ea8 100644 --- a/mteb/load_results/benchmark_results.py +++ b/mteb/load_results/benchmark_results.py @@ -259,6 +259,8 @@ def parse_version(version_str: str) -> Version | None: return None def keep_best(group: pd.DataFrame) -> pd.DataFrame: + # Filtering out task_results where no scores are present + group = group[group["has_scores"]] is_main_revision = group["revision"] == group["main_revision"] # If the main revision is present we select that if is_main_revision.sum() > 0: @@ -286,6 +288,7 @@ def keep_best(group: pd.DataFrame) -> pd.DataFrame: task_name=task_result.task_name, mteb_version=task_result.mteb_version, task_result=task_result, + has_scores=bool(task_result.scores), ) ) task_df = pd.DataFrame.from_records(records) From e1be4388cd33dc1d42c7c657bc00d7acbf6a3b29 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 28 Jan 2025 15:24:34 +0000 Subject: [PATCH 012/275] 1.31.3 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml 
b/pyproject.toml index 111e90a28a..2360471341 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.31.2" +version = "1.31.3" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 8fb59a49b00e7932abec42c045c1cc068c7eba41 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Wed, 29 Jan 2025 11:53:46 +0100 Subject: [PATCH 013/275] fix: Allow aggregated tasks within benchmarks (#1771) * fix: Allow aggregated tasks within benchmarks Fixes #1231 * feat: Update task filtering, fixing bug on MTEB - Updated task filtering adding exclusive_language_filter and hf_subset - fix bug in MTEB where cross-lingual splits were included - added missing language filtering to MTEB(europe, beta) and MTEB(indic, beta) The following code outlines the problems: ```py import mteb from mteb.benchmarks import MTEB_ENG_CLASSIC task = [t for t in MTEB_ENG_CLASSIC.tasks if t.metadata.name == "STS22"][0] # was eq. to: task = mteb.get_task("STS22", languages=["eng"]) task.hf_subsets # correct filtering to English datasets: # ['en', 'de-en', 'es-en', 'pl-en', 'zh-en'] # However it should be: # ['en'] # with the changes it is: task = [t for t in MTEB_ENG_CLASSIC.tasks if t.metadata.name == "STS22"][0] task.hf_subsets # ['en'] # eq. 
to task = mteb.get_task("STS22", hf_subsets=["en"]) # which you can also obtain using the exclusive_language_filter (though not if there was multiple english splits): task = mteb.get_task("STS22", languages=["eng"], exclusive_language_filter=True) ``` * format * remove "en-ext" from AmazonCounterfactualClassification * fixed mteb(deu) * fix: simplify in a few areas * wip * tmp * sav * Allow aggregated tasks within benchmarks Fixes #1231 * ensure correct formatting of eval_langs * ignore aggregate dataset * clean up dummy cases * add to mteb(eng, classic) * format * clean up * Allow aggregated tasks within benchmarks Fixes #1231 * added fixed from comments * fix merge * format * Updated task type * Added minor fix for dummy tasks --- .gitignore | 3 +- mteb/abstasks/AbsTask.py | 3 +- mteb/abstasks/AbsTaskBitextMining.py | 2 +- mteb/abstasks/TaskMetadata.py | 25 ++- mteb/abstasks/aggregate_task_metadata.py | 172 ++++++++++++++++++ mteb/abstasks/aggregated_task.py | 149 +++++++++++++++ mteb/benchmarks/benchmarks.py | 13 +- mteb/create_meta.py | 8 +- mteb/evaluation/MTEB.py | 42 ++++- mteb/leaderboard/app.py | 4 +- mteb/load_results/benchmark_results.py | 20 +- mteb/load_results/task_results.py | 43 ++--- mteb/tasks/__init__.py | 1 + .../aggregated_tasks/CQADupStackRetrieval.py | 62 +++++++ mteb/tasks/aggregated_tasks/__init__.py | 5 + tests/test_TaskMetadata.py | 1 + tests/test_overview.py | 2 +- tests/test_tasks/test_all_abstasks.py | 3 + 18 files changed, 484 insertions(+), 74 deletions(-) create mode 100644 mteb/abstasks/aggregate_task_metadata.py create mode 100644 mteb/abstasks/aggregated_task.py create mode 100644 mteb/tasks/aggregated_tasks/CQADupStackRetrieval.py create mode 100644 mteb/tasks/aggregated_tasks/__init__.py diff --git a/.gitignore b/.gitignore index 977fe8dc1a..d5cc51748b 100644 --- a/.gitignore +++ b/.gitignore @@ -147,4 +147,5 @@ results/ uv.lock # model loading tests -model_names.txt \ No newline at end of file +model_names.txt 
+mteb/leaderboard/__cached_results.json diff --git a/mteb/abstasks/AbsTask.py b/mteb/abstasks/AbsTask.py index 1ec1ebc4fc..c0368dcc54 100644 --- a/mteb/abstasks/AbsTask.py +++ b/mteb/abstasks/AbsTask.py @@ -63,7 +63,7 @@ class AbsTask(ABC): dataset: dict[HFSubset, DatasetDict] | None = None # type: ignore data_loaded: bool = False is_multilingual: bool = False - hf_subsets: list[HFSubset] | None = None + hf_subsets: list[HFSubset] def __init__(self, seed: int = 42, **kwargs: Any): self.save_suffix = kwargs.get("save_suffix", "") @@ -73,6 +73,7 @@ def __init__(self, seed: int = 42, **kwargs: Any): np.random.seed(self.seed) torch.manual_seed(self.seed) torch.cuda.manual_seed_all(self.seed) + self.hf_subsets = list(self.metadata.hf_subsets_to_langscripts.keys()) def check_if_dataset_is_superseded(self): """Check if the dataset is superseded by a newer version""" diff --git a/mteb/abstasks/AbsTaskBitextMining.py b/mteb/abstasks/AbsTaskBitextMining.py index 1c373cc2f7..b8105dc141 100644 --- a/mteb/abstasks/AbsTaskBitextMining.py +++ b/mteb/abstasks/AbsTaskBitextMining.py @@ -71,7 +71,7 @@ def evaluate( subsets_to_run: list[HFSubset] | None = None, *, encode_kwargs: dict[str, Any] = {}, - **kwargs, + **kwargs: Any, ) -> dict[HFSubset, ScoresDict]: if not self.data_loaded: self.load_data() diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py index b04753d877..f0f8a66143 100644 --- a/mteb/abstasks/TaskMetadata.py +++ b/mteb/abstasks/TaskMetadata.py @@ -82,8 +82,8 @@ "machine-translated and verified", "machine-translated and localized", "LM-generated and verified", + "multiple", ] - TASK_TYPE = Literal[ "BitextMining", "Classification", @@ -98,6 +98,7 @@ "Speed", ] + TASK_CATEGORY = Literal[ "s2s", # Sentence-to-sentence "s2p", # Sentence-to-paragraph @@ -169,9 +170,10 @@ "gpl-3.0", "cdla-sharing-1.0", "mpl-2.0", + "multiple", ] ) - +MODALITIES = Literal["text"] METRIC_NAME = str METRIC_VALUE = Union[int, float, dict[str, Any]] @@ -228,13 +230,13 @@ 
class TaskMetadata(BaseModel): bibtex_citation: The BibTeX citation for the dataset. Should be an empty string if no citation is available. """ - dataset: dict + dataset: dict[str, Any] name: str description: str prompt: str | PromptDict | None = None type: TASK_TYPE - modalities: list[Literal["text"]] = ["text"] + modalities: list[MODALITIES] = ["text"] category: TASK_CATEGORY | None = None reference: STR_URL | None = None @@ -335,6 +337,15 @@ def _check_language_code(code): f"Invalid script code: {script}, you can find valid ISO 15924 codes in {path_to_lang_scripts}" ) + @property + def bcp47_codes(self) -> list[ISO_LANGUAGE_SCRIPT]: + """Return the languages and script codes of the dataset formatting in accordance with the BCP-47 standard.""" + if isinstance(self.eval_langs, dict): + return sorted( + {lang for langs in self.eval_langs.values() for lang in langs} + ) + return sorted(set(self.eval_langs)) + @property def languages(self) -> list[str]: """Return the languages of the dataset as iso639-3 codes.""" @@ -421,8 +432,12 @@ def n_samples(self) -> dict[str, int] | None: for subset, subset_value in stats.items(): if subset == "hf_subset_descriptive_stats": continue - n_samples[subset] = subset_value["num_samples"] + n_samples[subset] = subset_value["num_samples"] # type: ignore return n_samples def __hash__(self) -> int: return hash(self.model_dump_json()) + + @property + def revision(self) -> str: + return self.dataset["revision"] diff --git a/mteb/abstasks/aggregate_task_metadata.py b/mteb/abstasks/aggregate_task_metadata.py new file mode 100644 index 0000000000..106419b752 --- /dev/null +++ b/mteb/abstasks/aggregate_task_metadata.py @@ -0,0 +1,172 @@ +from __future__ import annotations + +import logging +from datetime import datetime +from typing import Any + +from pydantic import ConfigDict, model_validator + +from mteb.abstasks.AbsTask import AbsTask +from mteb.abstasks.TaskMetadata import ( + ANNOTATOR_TYPE, + LANGUAGES, + LICENSES, + MODALITIES, + 
SAMPLE_CREATION_METHOD, + STR_DATE, + TASK_DOMAIN, + TASK_SUBTYPE, + TASK_TYPE, + HFSubset, + TaskMetadata, +) +from mteb.languages import ISO_LANGUAGE_SCRIPT + +logger = logging.getLogger(__name__) + + +class AggregateTaskMetadata(TaskMetadata): + """Metadata for an aggregation of tasks. This description only covers exceptions to the TaskMetadata. Many of the field if not filled out will be + autofilled from its tasks. + + Attributes: + name: The name of the aggregated task. + description: A description of the task. Should explain the aggregation. + prompt: An aggregate task does not have a prompt, thus this value is always None. + dataset: The dataset for the aggregated task is specified in its tasks. The aggregate task thus only specified the revision and uses a + placeholder path. + tasks: A list of tasks, the majority of the metadata is described within its tasks. + eval_splits: The splits of the tasks used for evaluation. + """ + + model_config = ConfigDict(arbitrary_types_allowed=True) + + name: str + description: str + dataset: dict[str, Any] = { + "path": "aggregate tasks do not have a path", # just a place holder + "revision": "1", + } + + tasks: list[AbsTask] + main_score: str + type: TASK_TYPE + eval_splits: list[str] + eval_langs: LANGUAGES = [] + prompt: None = None + reference: str | None = None + bibtex_citation: str | None = None + + @property + def hf_subsets_to_langscripts(self) -> dict[HFSubset, list[ISO_LANGUAGE_SCRIPT]]: + """Return a dictionary mapping huggingface subsets to languages.""" + return {"default": self.eval_langs} # type: ignore + + @model_validator(mode="after") # type: ignore + def compute_unfilled_cases(self) -> AggregateTaskMetadata: + if not self.eval_langs: + self.eval_langs = self.compute_eval_langs() + if not self.date: + self.date = self.compute_date() + if not self.domains: + self.domains = self.compute_domains() + if not self.task_subtypes: + self.task_subtypes = self.compute_task_subtypes() + if not self.license: + 
self.license = self.compute_license() + if not self.annotations_creators: + self.annotations_creators = self.compute_annotations_creators() + if not self.dialect: + self.dialect = self.compute_dialect() + if not self.sample_creation: + self.sample_creation = self.compute_sample_creation() + if not self.modalities: + self.modalities = self.compute_modalities() + + return self + + def compute_eval_langs(self) -> list[ISO_LANGUAGE_SCRIPT]: + langs = set() + for task in self.tasks: + langs.update(set(task.metadata.bcp47_codes)) + return list(langs) + + def compute_date(self) -> tuple[STR_DATE, STR_DATE] | None: + # get min max date from tasks + dates = [] + for task in self.tasks: + if task.metadata.date: + dates.append(datetime.fromisoformat(task.metadata.date[0])) + dates.append(datetime.fromisoformat(task.metadata.date[1])) + + if not dates: + return None + + min_date = min(dates) + max_date = max(dates) + return min_date.isoformat(), max_date.isoformat() + + def compute_domains(self) -> list[TASK_DOMAIN] | None: + domains = set() + for task in self.tasks: + if task.metadata.domains: + domains.update(set(task.metadata.domains)) + if domains: + return list(domains) + return None + + def compute_task_subtypes(self) -> list[TASK_SUBTYPE] | None: + subtypes = set() + for task in self.tasks: + if task.metadata.task_subtypes: + subtypes.update(set(task.metadata.task_subtypes)) + if subtypes: + return list(subtypes) + return None + + def compute_license(self) -> LICENSES | None: + licenses = set() + for task in self.tasks: + if task.metadata.license: + licenses.add(task.metadata.license) + if len(licenses) > 1: + return "multiple" + return None + + def compute_annotations_creators(self) -> ANNOTATOR_TYPE | None: + creators = set() + for task in self.tasks: + if task.metadata.annotations_creators: + creators.add(task.metadata.annotations_creators) + if len(creators) > 1: + logger.warning( + f"Multiple annotations_creators found for tasks in {self.name}. 
Using None as annotations_creators." + ) + return None + + def compute_dialect(self) -> list[str] | None: + dialects = set() + for task in self.tasks: + if task.metadata.dialect: + dialects.update(set(task.metadata.dialect)) + if dialects: + return list(dialects) + return None + + def compute_sample_creation(self) -> SAMPLE_CREATION_METHOD | None: + sample_creations = set() + for task in self.tasks: + if task.metadata.sample_creation: + sample_creations.add(task.metadata.sample_creation) + if len(sample_creations) > 1: + return "multiple" + return None + + def compute_modalities(self) -> list[MODALITIES]: + modalities = set() + for task in self.tasks: + if task.metadata.modalities: + modalities.update(set(task.metadata.modalities)) + if modalities: + return list(modalities) + return None diff --git a/mteb/abstasks/aggregated_task.py b/mteb/abstasks/aggregated_task.py new file mode 100644 index 0000000000..255df2000f --- /dev/null +++ b/mteb/abstasks/aggregated_task.py @@ -0,0 +1,149 @@ +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING, Any + +import numpy as np + +from mteb.abstasks.AbsTask import AbsTask +from mteb.abstasks.aggregate_task_metadata import AggregateTaskMetadata + +if TYPE_CHECKING: + from datasets import Dataset, DatasetDict + + from mteb.abstasks.TaskMetadata import DescriptiveStatistics, HFSubset + from mteb.encoder_interface import Encoder + from mteb.load_results.task_results import TaskResult + + from .AbsTask import ScoresDict + +logger = logging.getLogger(__name__) + + +class AbsTaskAggregate(AbsTask): + metadata: AggregateTaskMetadata + superseded_by: None | str = None + hf_subset = "default" # since there is no subset we use the "default" naming scheme + _eval_splits: list[str] | None = None + + def __init__(self, **kwargs: Any): + self.tasks = self.metadata.tasks + self.taskname_to_task = {task.metadata.name: task for task in self.tasks} + + def task_results_to_scores( + self, task_results: 
list[TaskResult] + ) -> dict[str, dict[HFSubset, ScoresDict]]: + """The function that aggregated scores. Can be redefined to allow for custom aggregations.""" + scores = {} + for split in self.metadata.eval_splits: + main_scores = [] + for task_res in task_results: + main_scores.append( + task_res.get_score_fast( + languages=None, + splits=self.metadata.eval_splits, + ) + ) + main_score = np.mean(main_scores) + scores[split] = { + "default": { + self.metadata.main_score: main_score, + "main_score": main_score, + } + } + return scores + + def combine_task_results(self, task_results: list[TaskResult]) -> TaskResult: + """Combined the task results for using `task_results_to_scores`. Do not redefine this function if you want to implement a custom aggregation. + Instead redefin `task_results_to_scores`. + """ + from mteb.load_results.task_results import ( + TaskResult, # to prevent circular imports, # TODO: can potentially likely be out of function in in v2.0.0 + ) + + eval_times = [tr.evaluation_time for tr in task_results if tr.evaluation_time] + if len(eval_times) != len(task_results): + logger.info( + f"Loaded results does not include runtime. Therefor evaluation of {self.metadata.name} " + + "can't be computed. Setting it to None." + ) + eval_time = np.nan + else: + eval_time = sum(eval_times) + + kg_co2_emissions_ = [ + tr.kg_co2_emissions for tr in task_results if tr.kg_co2_emissions + ] + if len(kg_co2_emissions_) != len(task_results): + logger.info( + f"Loaded results does not include co2-eq emissions. Therefor evaluation of {self.metadata.name} " + + "can't be computed. Setting it to None." 
+ ) + kg_co2_emissions = np.nan + else: + kg_co2_emissions = sum(kg_co2_emissions_) + + task_res = TaskResult.from_task_results( + self, + scores=self.task_results_to_scores(task_results), + evaluation_time=eval_time, + kg_co2_emissions=kg_co2_emissions, + ) + mteb_versions = {tr.mteb_version for tr in task_results} + if len(mteb_versions) != 1: + logger.warning( + f"All tasks of {self.metadata.name} is not run using the same version." + ) + task_res.mteb_version = None + task_res.mteb_version = task_results[0].mteb_version + return task_res + + def check_if_dataset_is_superseded(self): + """Check if the dataset is superseded by a newer version""" + if self.superseded_by: + logger.warning( + f"Dataset '{self.metadata.name}' is superseded by '{self.superseded_by}', you might consider using the newer version of the dataset." + ) + + def filter_eval_splits(self, eval_splits: list[str] | None) -> AbsTaskAggregate: + """Filter the evaluation splits of the task.""" + self._eval_splits = eval_splits + return self + + def evaluate( + self, + model: Encoder, + split: str = "test", + subsets_to_run: list[HFSubset] | None = None, + *, + encode_kwargs: dict[str, Any] = {}, + **kwargs: Any, + ) -> dict[HFSubset, ScoresDict]: + # TODO: If we refactor the runner to at least have a subfunction mteb.run_task(model, task) we could use that here + raise NotImplementedError( + "Aggregate tasks can't be evaluated directly. Instead run it using the MTEB class." + ) + + def _evaluate_subset( + self, + model: Encoder, + data_split: DatasetDict | Dataset, + encode_kwargs: dict[str, Any], + **kwargs: Any, + ) -> ScoresDict: + raise NotImplementedError( + "Aggregate tasks does not implement a _evaluate_subset. Instead use the individual tasks." 
+ ) + + def _calculate_metrics_from_split( + self, split: str, hf_subset: str | None = None, compute_overall: bool = False + ) -> DescriptiveStatistics: + raise NotImplementedError( + "Aggregate tasks does not implement a _calculate_metrics_from_split. Instead use the individual tasks." + ) + + @property + def eval_splits(self) -> list[str]: + if self._eval_splits: + return self._eval_splits + return self.metadata.eval_splits diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 415b11eddb..c1e5236ecf 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -154,18 +154,7 @@ def load_results( "Banking77Classification", "BiorxivClusteringP2P", "BiorxivClusteringS2S", - "CQADupstackAndroidRetrieval", - "CQADupstackEnglishRetrieval", - "CQADupstackGamingRetrieval", - "CQADupstackGisRetrieval", - "CQADupstackMathematicaRetrieval", - "CQADupstackPhysicsRetrieval", - "CQADupstackProgrammersRetrieval", - "CQADupstackStatsRetrieval", - "CQADupstackTexRetrieval", - "CQADupstackUnixRetrieval", - "CQADupstackWebmastersRetrieval", - "CQADupstackWordpressRetrieval", + "CQADupstackRetrieval", "ClimateFEVER", "DBPedia", "EmotionClassification", diff --git a/mteb/create_meta.py b/mteb/create_meta.py index 02ed273996..bf7402613d 100644 --- a/mteb/create_meta.py +++ b/mteb/create_meta.py @@ -8,7 +8,6 @@ import mteb from mteb import TaskResult -from mteb.load_results.task_results import CQADupstackRetrievalDummy def generate_readme(results_folder: Path, from_existing: Path | None = None) -> str: @@ -46,12 +45,7 @@ def load_model_name(results_folder: Path) -> str: def process_task_result(task_result: TaskResult) -> list[dict[str, Any]]: - # CQADupstackRetrieval is a combined dataset (special case atm.) 
- task = ( - CQADupstackRetrievalDummy() - if task_result.task_name == "CQADupstackRetrieval" - else mteb.get_task(task_result.task_name) - ) + task = mteb.get_task(task_result.task_name) yaml_results = [] for split, hf_subset_scores in task_result.scores.items(): diff --git a/mteb/evaluation/MTEB.py b/mteb/evaluation/MTEB.py index 70378931c2..bcef789174 100644 --- a/mteb/evaluation/MTEB.py +++ b/mteb/evaluation/MTEB.py @@ -4,7 +4,7 @@ import logging import os import traceback -from collections.abc import Iterable +from collections.abc import Iterable, Sequence from copy import copy, deepcopy from datetime import datetime from itertools import chain @@ -16,6 +16,7 @@ from sentence_transformers import CrossEncoder, SentenceTransformer from mteb.abstasks.AbsTask import ScoresDict +from mteb.abstasks.aggregated_task import AbsTaskAggregate from mteb.encoder_interface import Encoder from mteb.model_meta import ModelMeta from mteb.models import model_meta_from_sentence_transformers @@ -31,9 +32,12 @@ class MTEB: + _tasks: Iterable[str | AbsTask] | None + tasks: list[AbsTask] + def __init__( self, - tasks: Iterable[str | AbsTask] | None = None, + tasks: Sequence[str | AbsTask] | None = None, *, task_types: list[str] | None = None, task_categories: list[str] | None = None, @@ -61,12 +65,11 @@ def __init__( self.deprecation_warning( task_types, task_categories, task_langs, tasks, version ) - if tasks is not None: self._tasks = tasks if isinstance(tasks[0], Benchmark): self.benchmarks = tasks - self._tasks = list(chain.from_iterable(tasks)) + self._tasks = self._tasks = list(chain.from_iterable(tasks)) # type: ignore assert ( task_types is None and task_categories is None ), "Cannot specify both `tasks` and `task_types`/`task_categories`" @@ -253,7 +256,7 @@ def select_tasks(self, **kwargs): f"WARNING: Unknown tasks: {unknown_str}. Known tasks: {known_str}." 
) # add task if subclass of mteb.tasks - self.tasks.extend([x for x in self._tasks if isinstance(x, AbsTask)]) + self.tasks.extend([x for x in self._tasks if isinstance(x, (AbsTask))]) return # Otherwise use filters to select tasks @@ -463,6 +466,29 @@ def run( f"\n\n********************** Evaluating {task.metadata.name} **********************" ) + if isinstance(task, AbsTaskAggregate): + self_ = MTEB(tasks=task.metadata.tasks) + task_results = self_.run( + model, + verbosity=verbosity - 1, + output_folder=output_folder, + eval_splits=eval_splits, + eval_subsets=eval_subsets, + overwrite_results=overwrite_results, + raise_error=raise_error, + co2_tracker=co2_tracker, + encode_kwargs=encode_kwargs, + **kwargs, + ) + new_results = task.combine_task_results(task_results) + evaluation_results.append(new_results) + + if output_path: + save_path = output_path / f"{task.metadata.name}.json" + new_results.to_disk(save_path) + del self.tasks[0] + continue + if "bm25s" in meta.name and task.metadata.type != "Retrieval": logger.warning( f"bm25s only supports Retrieval tasks, but the task type is {task.metadata.type}. Skipping task." 
@@ -473,7 +499,11 @@ def run( task_eval_splits = ( eval_splits if eval_splits is not None else task.eval_splits ) - task_subsets = list(task.metadata.hf_subsets_to_langscripts.keys()) + task_subsets = ( + task.hf_subsets + if task.hf_subsets + else list(task.metadata.hf_subsets_to_langscripts.keys()) + ) existing_results = None save_path = None diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 31ff87adef..483b681e23 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -5,6 +5,7 @@ import logging import tempfile import time +import typing from pathlib import Path from typing import Literal from urllib.parse import urlencode @@ -14,6 +15,7 @@ from gradio_rangeslider import RangeSlider import mteb +from mteb.abstasks.TaskMetadata import TASK_TYPE from mteb.caching import json_cache from mteb.leaderboard.figures import performance_size_plot, radar_chart from mteb.leaderboard.table import scores_to_tables @@ -206,7 +208,7 @@ def filter_models( ) type_select = gr.Dropdown( all_results.task_types, - value=sorted(default_results.task_types), + value=sorted(typing.get_args(TASK_TYPE)), multiselect=True, label="Task Type", info="Select task types to include.", diff --git a/mteb/load_results/benchmark_results.py b/mteb/load_results/benchmark_results.py index c2c3b76ea8..caece1b2b4 100644 --- a/mteb/load_results/benchmark_results.py +++ b/mteb/load_results/benchmark_results.py @@ -3,7 +3,7 @@ import json import warnings from collections import defaultdict -from collections.abc import Iterable +from collections.abc import Iterable, Sequence from pathlib import Path from typing import Any, Callable, Literal @@ -69,7 +69,7 @@ def filter_tasks( task_results=new_task_results, ) - def select_tasks(self, tasks: list[AbsTask]) -> ModelResult: + def select_tasks(self, tasks: Sequence[AbsTask]) -> ModelResult: task_name_to_task = {task.metadata.name: task for task in tasks} new_task_results = [ 
task_res.validate_and_filter_scores(task_name_to_task[task_res.task_name]) @@ -105,15 +105,15 @@ def get_scores( try: if use_fast: scores[res.task_name] = res.get_score_fast( - splits=splits, - languages=languages, + splits=splits, # type: ignore + languages=languages, # type: ignore ) else: scores[res.task_name] = res.get_score( splits=splits, languages=languages, - aggregation=aggregation, - getter=getter, + aggregation=aggregation, # type: ignore + getter=getter, # type: ignore scripts=scripts, ) except Exception as e: @@ -216,7 +216,7 @@ def filter_tasks( model_results=[res for res in model_results if res.task_results] ) - def select_tasks(self, tasks: list[AbsTask]) -> BenchmarkResults: + def select_tasks(self, tasks: Sequence[AbsTask]) -> BenchmarkResults: new_model_results = [ model_res.select_tasks(tasks) for model_res in self.model_results ] @@ -317,8 +317,8 @@ def get_scores( splits: list[Split] | None = None, languages: list[ISO_LANGUAGE | ISO_LANGUAGE_SCRIPT] | None = None, scripts: list[ISO_LANGUAGE_SCRIPT] | None = None, - getter: Callable[[ScoresDict], Score] = None, - aggregation: Callable[[list[Score]], Any] = None, + getter: Callable[[ScoresDict], Score] | None = None, + aggregation: Callable[[list[Score]], Any] | None = None, format: Literal["wide", "long"] = "wide", ) -> list[dict]: entries = [] @@ -393,7 +393,7 @@ def to_dict(self) -> dict: return self.model_dump() @classmethod - def from_dict(cls, data: dict) -> TaskResult: + def from_dict(cls, data: dict) -> BenchmarkResults: return cls.model_validate(data) def to_disk(self, path: Path | str) -> None: diff --git a/mteb/load_results/task_results.py b/mteb/load_results/task_results.py index 72cae5a93d..4ff2406934 100644 --- a/mteb/load_results/task_results.py +++ b/mteb/load_results/task_results.py @@ -4,6 +4,7 @@ import logging from argparse import Namespace from collections import defaultdict +from collections.abc import Iterable from functools import cached_property from importlib.metadata 
import version from pathlib import Path @@ -23,24 +24,6 @@ logger = logging.getLogger(__name__) -# Tasks that were completely removed from the MTEB (we generally don't do this anymore instead we supersede tasks) -class CQADupstackRetrievalDummy: - """A dummy task for loading historic results from before v1.11.0""" - - metadata = Namespace( # type: ignore - name="CQADupstackRetrieval", - main_score="ndcg_at_10", - type="Retrieval", - hf_subsets_to_langscripts={ - "default": ["eng-Latn"], - }, - dataset={ - "revision": "revision not applicable", - "path": "CQADupstackRetrieval_is_a_combined_dataset", - }, - ) - - class ScalaNbClassificationDummy: """A dummy task for loading historic results from before v1.11.0""" @@ -52,6 +35,7 @@ class ScalaNbClassificationDummy: "default": ["nob-Latn"], }, dataset={"revision": "revision_not_applicable"}, + revision="revision_not_applicable", ) @@ -66,6 +50,7 @@ class ScalaNnClassificationDummy: "default": ["nno-Latn"], }, dataset={"revision": "revision_not_applicable"}, + revision="revision_not_applicable", ) @@ -80,6 +65,7 @@ class ScalaDaClassificationDummy: "default": ["dan-Latn"], }, dataset={"revision": "revision_not_applicable"}, + revision="revision_not_applicable", ) @@ -94,11 +80,11 @@ class ScalaSvClassificationDummy: "default": ["swe-Latn"], }, dataset={"revision": "revision_not_applicable"}, + revision="revision_not_applicable", ) outdated_tasks = { - "CQADupstackRetrieval": CQADupstackRetrievalDummy, "ScalaNbClassification": ScalaNbClassificationDummy, "ScalaNnClassification": ScalaNnClassificationDummy, "ScalaDaClassification": ScalaDaClassificationDummy, @@ -183,7 +169,7 @@ def from_task_results( flat_scores[split].append(_scores) return TaskResult( - dataset_revision=task.metadata.dataset["revision"], + dataset_revision=task.metadata.revision, task_name=task.metadata.name, mteb_version=version("mteb"), scores=flat_scores, @@ -471,10 +457,12 @@ def get_score( return aggregation(values) - def get_score_fast(self, 
splits: str | None, languages: str | None) -> float: + def get_score_fast( + self, splits: Iterable[str] | None = None, languages: str | None = None + ) -> float: """Sped up version of get_score that will be used if no aggregation, script or getter needs to be specified.""" if splits is None: - splits = self.scores + splits = self.scores.keys() val_sum = 0 n_val = 0 for split in splits: @@ -536,14 +524,11 @@ def validate_and_filter_scores(self, task: AbsTask | None = None) -> TaskResult: if task is None: task = get_task(self.task_name) + splits = task.metadata.eval_splits - if task.is_multilingual: - hf_subsets = getattr( - task, "hf_subsets", task.metadata.hf_subsets_to_langscripts.keys() - ) - hf_subsets = set(hf_subsets) - else: - hf_subsets = {"default"} + hf_subsets = task.hf_subsets + hf_subsets = set(hf_subsets) + new_scores = {} seen_splits = set() for split in self.scores: diff --git a/mteb/tasks/__init__.py b/mteb/tasks/__init__.py index dfe568bb89..8a53eb0d7d 100644 --- a/mteb/tasks/__init__.py +++ b/mteb/tasks/__init__.py @@ -1,5 +1,6 @@ from __future__ import annotations +from .aggregated_tasks import * from .BitextMining import * from .Classification import * from .Clustering import * diff --git a/mteb/tasks/aggregated_tasks/CQADupStackRetrieval.py b/mteb/tasks/aggregated_tasks/CQADupStackRetrieval.py new file mode 100644 index 0000000000..917a667eb3 --- /dev/null +++ b/mteb/tasks/aggregated_tasks/CQADupStackRetrieval.py @@ -0,0 +1,62 @@ +from __future__ import annotations + +from mteb.abstasks import AbsTask +from mteb.abstasks.aggregated_task import AbsTaskAggregate, AggregateTaskMetadata +from mteb.tasks.Retrieval import ( + CQADupstackAndroidRetrieval, + CQADupstackEnglishRetrieval, + CQADupstackGamingRetrieval, + CQADupstackGisRetrieval, + CQADupstackMathematicaRetrieval, + CQADupstackPhysicsRetrieval, + CQADupstackProgrammersRetrieval, + CQADupstackStatsRetrieval, + CQADupstackTexRetrieval, + CQADupstackUnixRetrieval, + 
CQADupstackWebmastersRetrieval, + CQADupstackWordpressRetrieval, +) + +task_list_cqa: list[AbsTask] = [ + CQADupstackAndroidRetrieval(), + CQADupstackEnglishRetrieval(), + CQADupstackGamingRetrieval(), + CQADupstackGisRetrieval(), + CQADupstackMathematicaRetrieval(), + CQADupstackPhysicsRetrieval(), + CQADupstackProgrammersRetrieval(), + CQADupstackStatsRetrieval(), + CQADupstackTexRetrieval(), + CQADupstackUnixRetrieval(), + CQADupstackWebmastersRetrieval(), + CQADupstackWordpressRetrieval(), +] + + +class CQADupstackRetrieval(AbsTaskAggregate): + metadata = AggregateTaskMetadata( + name="CQADupstackRetrieval", + description="CQADupStack: A Benchmark Data Set for Community Question-Answering Research", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + tasks=task_list_cqa, + main_score="ndcg_at_10", + type="Retrieval", # since everything is retrieval - otherwise it would be "Aggregated" + eval_splits=["test"], + bibtex_citation="""@inproceedings{hoogeveen2015, +author = {Hoogeveen, Doris and Verspoor, Karin M. 
and Baldwin, Timothy}, +title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, +booktitle = {Proceedings of the 20th Australasian Document Computing Symposium (ADCS)}, +series = {ADCS '15}, +year = {2015}, +isbn = {978-1-4503-4040-3}, +location = {Parramatta, NSW, Australia}, +pages = {3:1--3:8}, +articleno = {3}, +numpages = {8}, +url = {http://doi.acm.org/10.1145/2838931.2838934}, +doi = {10.1145/2838931.2838934}, +acmid = {2838934}, +publisher = {ACM}, +address = {New York, NY, USA}, +}""", + ) diff --git a/mteb/tasks/aggregated_tasks/__init__.py b/mteb/tasks/aggregated_tasks/__init__.py new file mode 100644 index 0000000000..15025208cc --- /dev/null +++ b/mteb/tasks/aggregated_tasks/__init__.py @@ -0,0 +1,5 @@ +from __future__ import annotations + +from .CQADupStackRetrieval import CQADupstackRetrieval + +__all__ = ["CQADupstackRetrieval"] diff --git a/tests/test_TaskMetadata.py b/tests/test_TaskMetadata.py index 2b606c2c19..09df66628a 100644 --- a/tests/test_TaskMetadata.py +++ b/tests/test_TaskMetadata.py @@ -59,6 +59,7 @@ "AILAStatutes", "ArguAna", "ClimateFEVER", + "CQADupstackRetrieval", "CQADupstackAndroidRetrieval", "CQADupstackEnglishRetrieval", "CQADupstackGamingRetrieval", diff --git a/tests/test_overview.py b/tests/test_overview.py index 127e54f279..7041328a59 100644 --- a/tests/test_overview.py +++ b/tests/test_overview.py @@ -20,7 +20,7 @@ def test_get_tasks_size_differences(): ) -@pytest.mark.parametrize("task_name", ["BornholmBitextMining"]) +@pytest.mark.parametrize("task_name", ["BornholmBitextMining", "CQADupstackRetrieval"]) @pytest.mark.parametrize("eval_splits", [["test"], None]) def test_get_task(task_name: str, eval_splits: list[str] | None): task = get_task(task_name, eval_splits=eval_splits) diff --git a/tests/test_tasks/test_all_abstasks.py b/tests/test_tasks/test_all_abstasks.py index e252293ea5..5e310dfbb6 100644 --- a/tests/test_tasks/test_all_abstasks.py +++ b/tests/test_tasks/test_all_abstasks.py @@ 
-13,6 +13,7 @@ from mteb.abstasks.AbsTaskInstructionRetrieval import AbsTaskInstructionRetrieval from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval from mteb.abstasks.AbsTaskSpeedTask import AbsTaskSpeedTask +from mteb.abstasks.aggregated_task import AbsTaskAggregate from mteb.abstasks.MultiSubsetLoader import MultiSubsetLoader from mteb.overview import TASKS_REGISTRY @@ -86,6 +87,8 @@ async def check_datasets_are_available_on_hf(tasks): def test_dataset_availability(): """Checks if the datasets are available on Hugging Face using both their name and revision.""" tasks = MTEB().tasks_cls + # do not check aggregated tasks as they don't have a dataset + tasks = [t for t in tasks if not isinstance(t, AbsTaskAggregate)] tasks = [ t for t in tasks From d939627d27273b7b740e8ade5287f074aa6c0303 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 10:55:52 +0000 Subject: [PATCH 014/275] Update tasks table --- docs/tasks.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/tasks.md b/docs/tasks.md index 15b9474168..130d88f135 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,6 +73,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | @@ -930,7 +931,7 @@ The following tables give you an overview of the tasks in MTEB. 
| ell | Modern Greek (1453-) | Indo-European | 3 | 6 | 1 | 0 | 1 | 2 | 0 | 3 | 0 | 0 | 0 | 16 | | emi | Mussau-Emira | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | emp | Northern Emberá | Chocoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eng | English | Indo-European | 17 | 160 | 18 | 3 | 1 | 13 | 8 | 107 | 13 | 2 | 1 | 343 | +| eng | English | Indo-European | 17 | 160 | 18 | 3 | 1 | 13 | 8 | 108 | 13 | 2 | 1 | 344 | | enq | Enga | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | epo | Esperanto | Artificial Language | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | | eri | Ogea | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1702,7 +1703,7 @@ The following tables give you an overview of the tasks in MTEB. | zty | Yatee Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zul | Zulu | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | | zyp | Zyphe Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| Total | None | None | None | 1395 | 812 | 306 | 3 | 28 | 84 | 51 | 475 | 85 | 2 | 2 | +| Total | None | None | None | 1395 | 812 | 306 | 3 | 28 | 84 | 51 | 476 | 85 | 2 | 2 | From 1ead72f3f03d3c858aca69216d031f95e64140dc Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 10:58:03 +0000 Subject: [PATCH 015/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 130d88f135..5b78dbeb9d 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From cebf5b6d81981662af05762c8dffdb1a9bd5bf1a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 11:00:25 +0000 Subject: [PATCH 016/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 5b78dbeb9d..ba5b3675b7 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Non-fiction] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From 8bc101f9e6933a197a272a3189c92d85bc9b00bd Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 11:02:47 +0000 Subject: [PATCH 017/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index ba5b3675b7..130d88f135 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From e487eff88f6a3460ae553b5fbef8bb552cfea92d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 11:04:52 +0000 Subject: [PATCH 018/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 130d88f135..5b78dbeb9d 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From 24d5373d5da6ffdd90085629c2da8192e1de935a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 11:07:15 +0000 Subject: [PATCH 019/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 5b78dbeb9d..ba5b3675b7 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Non-fiction] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From 2689cb84f40d2ac936942ea1590aec7dc5f63825 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 11:09:25 +0000 Subject: [PATCH 020/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index ba5b3675b7..632532281c 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Programming] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From 50f305fcb286969bcda9a110a4a9236c5f8a7157 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 11:13:44 +0000 Subject: [PATCH 021/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 632532281c..59a263dcb9 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Programming, Written] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From e8b8ac08952c4569fd3b8cff3218e3ee9e3f25e2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 11:16:57 +0000 Subject: [PATCH 022/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 59a263dcb9..5b78dbeb9d 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Programming, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From c77c82c0647e5f24d3035639da8a393193d08461 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 11:20:39 +0000 Subject: [PATCH 023/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 5b78dbeb9d..632532281c 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Programming] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From 02f8ad57a847c39bc1cdf1bdf7fbae3868ba0e5d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 11:22:39 +0000 Subject: [PATCH 024/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 632532281c..130d88f135 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From 3ee07855c30b4419b4bf2fab934f2282141eaa9a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 11:25:00 +0000 Subject: [PATCH 025/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 130d88f135..5b78dbeb9d 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From ce6e6666ecaafc12902d5fca2322b33a0a5819e1 Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 29 Jan 2025 11:29:57 +0000 Subject: [PATCH 026/275] 1.31.4 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2360471341..38408a360d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.31.3" +version = "1.31.4" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 
092688df377e9dbb9080170f1462ef09a3818fbd Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 11:32:12 +0000 Subject: [PATCH 027/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 5b78dbeb9d..130d88f135 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. | [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From cec0ed472fc762722bce24dfde6cb331f7006dee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1rton=20Kardos?= Date: Wed, 29 Jan 2025 14:26:24 +0100 Subject: [PATCH 028/275] fix: Limited plotly 
version to be less than 6.0.0 (#1902) Limited plotly version to be less than 6.0.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 38408a360d..258ed4c7c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,7 +57,7 @@ dev = ["ruff==0.6.4", # locked so we don't get PRs which fail only due to a lint codecarbon = ["codecarbon"] speedtask = ["GPUtil>=1.4.0", "psutil>=5.9.8"] peft = ["peft>=0.11.0"] -leaderboard = ["gradio>=5.7.1", "gradio_rangeslider>=0.0.8", "plotly>=5.24.0"] +leaderboard = ["gradio>=5.7.1", "gradio_rangeslider>=0.0.8", "plotly>=5.24.0,<6.0.0"] flagembedding = ["FlagEmbedding"] jina = ["einops>=0.8.0"] flash_attention = ["flash-attn>=2.6.3"] From a91d268014a8ce0a8791838a4c2709e32229a315 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 13:28:33 +0000 Subject: [PATCH 029/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 130d88f135..ba5b3675b7 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Non-fiction] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From d8bf18b2f4605fca6455c4761bc3235789ce6371 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 13:30:35 +0000 Subject: [PATCH 030/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index ba5b3675b7..130d88f135 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From d005797a765d4f031f5720146b0bc92ecdb8b5dd Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 13:32:50 +0000 Subject: [PATCH 031/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 130d88f135..ba5b3675b7 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Non-fiction] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From 251142ecdd3633985df07721482d38bac1d5896f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 13:35:05 +0000 Subject: [PATCH 032/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index ba5b3675b7..5b78dbeb9d 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From da0861794b3a0a2b47bddead9e1134ea05e83e11 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 13:39:20 +0000 Subject: [PATCH 033/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 5b78dbeb9d..632532281c 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Programming] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From 93f23c41fd3544499f00c95ea2b03a1e8e1c6d21 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 13:41:30 +0000 Subject: [PATCH 034/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 632532281c..130d88f135 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From a764fd7ea2c1a9f1cb72a5dd21b67bbab6b14bad Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 13:45:09 +0000 Subject: [PATCH 035/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 130d88f135..632532281c 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Programming] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From 086125475e647a6a4cb43bed863dd494671f34a0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 13:47:57 +0000 Subject: [PATCH 036/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 632532281c..130d88f135 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From 976bdd5682584fb892935d69efbcda315fd6364e Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Wed, 29 Jan 2025 16:49:28 +0300 Subject: [PATCH 037/275] update stella/jasper metainfo (#1896) update stella meta --- mteb/models/jasper_models.py | 17 +++++++++++++---- mteb/models/stella_models.py | 6 ++---- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/mteb/models/jasper_models.py b/mteb/models/jasper_models.py index dbd1615ad8..d0ff4ab681 100644 --- a/mteb/models/jasper_models.py +++ b/mteb/models/jasper_models.py @@ -90,8 +90,17 @@ def encode( use_instructions=True, adapted_from=None, superseded_by=None, - 
training_datasets=nvidia_training_datasets, # "In jasper model the teacher model is nvidia/NV-Embed-v2", source https://huggingface.co/infgrad/jasper_en_vision_language_v1 - # "non_mteb": ["BAAI/Infinity-MM", "HuggingFaceFW/fineweb-edu"], - public_training_code=None, - public_training_data=None, + training_datasets={ + # stage 1, 2, 3 + # "In jasper model the teacher model is nvidia/NV-Embed-v2", source https://huggingface.co/infgrad/jasper_en_vision_language_v1 + **nvidia_training_datasets, + # fineweb-edu + # https://huggingface.co/datasets/sentence-transformers/embedding-training-data + # stage 4 + # BAAI/Infinity-MM + }, + # training logs https://api.wandb.ai/links/dunnzhang0/z8jqoqpb + # more codes https://huggingface.co/NovaSearch/jasper_en_vision_language_v1/commit/da9b77d56c23d9398fa8f93af449102784f74e1d + public_training_code="https://github.com/NovaSearch-Team/RAG-Retrieval/blob/c40f4638b705eb77d88305d2056901ed550f9f4b/rag_retrieval/train/embedding/README.md", + public_training_data="https://huggingface.co/datasets/infgrad/jasper_text_distill_dataset", ) diff --git a/mteb/models/stella_models.py b/mteb/models/stella_models.py index 92d5db7c8a..9cc45a6e02 100644 --- a/mteb/models/stella_models.py +++ b/mteb/models/stella_models.py @@ -29,8 +29,7 @@ framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_400M_v5", training_datasets=None, - # will be at https://github.com/NLPJCL/RAG-Retrieval - public_training_code=None, + public_training_code="https://github.com/NovaSearch-Team/RAG-Retrieval/blob/c40f4638b705eb77d88305d2056901ed550f9f4b/rag_retrieval/train/embedding/README.md", public_training_data=None, ) @@ -56,9 +55,8 @@ similarity_fn_name="cosine", framework=["Sentence Transformers", "PyTorch", "GritLM"], reference="https://huggingface.co/dunzhang/stella_en_1.5B_v5", - # will be at https://github.com/NLPJCL/RAG-Retrieval training_datasets=None, - public_training_code=None, + 
public_training_code="https://github.com/NovaSearch-Team/RAG-Retrieval/blob/c40f4638b705eb77d88305d2056901ed550f9f4b/rag_retrieval/train/embedding/README.md", public_training_data=None, ) From cc1e899229336253a253b381a2c2767f8213dd8a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 13:55:36 +0000 Subject: [PATCH 038/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 130d88f135..59a263dcb9 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. | [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Programming, Written] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From 
1c84c1cb103eae8654b7c482f22efe6a80aa9900 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 13:59:22 +0000 Subject: [PATCH 039/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 59a263dcb9..1ad19a3490 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. | [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Programming, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Written, Non-fiction] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From d6deab116274cad0889929cb00beb31a5999664e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 14:01:32 +0000 Subject: [PATCH 040/275] Update tasks table --- docs/tasks.md 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 1ad19a3490..5b78dbeb9d 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. | [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Written, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From ef929f86a91d6361828c6786246b0c255bea4068 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 14:03:51 +0000 Subject: [PATCH 041/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 5b78dbeb9d..632532281c 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The 
following tables give you an overview of the tasks in MTEB. | [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Programming] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From a5d1538f58c688d42a5d2388540a023e12f332e8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 14:07:19 +0000 Subject: [PATCH 042/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 632532281c..130d88f135 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From 42c175f110a6190e08736332ebbb03c48991d8e7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 14:09:24 +0000 Subject: [PATCH 043/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 130d88f135..5b78dbeb9d 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From 7f9ca64c711eed7b10393ac512ea2d1492a1f31f Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 29 Jan 2025 14:15:01 +0000 Subject: [PATCH 044/275] 1.31.5 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 258ed4c7c6..b291f3f40d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.31.4" +version = "1.31.5" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 
e04218cbf376ccb04965da6b7860f4cfe696efba Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 14:17:05 +0000 Subject: [PATCH 045/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 5b78dbeb9d..632532281c 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. | [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Programming] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From 0a57880faeeb944a17004152c36dded85bcb218b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 14:19:20 +0000 Subject: [PATCH 046/275] Update tasks table --- docs/tasks.md 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 632532281c..ba5b3675b7 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. | [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Non-fiction] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From d44f9c3fa814165e6110dd5d3cd8c340d37ee405 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 14:21:23 +0000 Subject: [PATCH 047/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index ba5b3675b7..59a263dcb9 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The 
following tables give you an overview of the tasks in MTEB. | [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Programming, Written] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From 35b2c09e800f956ced87c643dd9afbe248dfb834 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 29 Jan 2025 14:23:29 +0000 Subject: [PATCH 048/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 59a263dcb9..1ad19a3490 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -73,7 +73,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Programming, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Written, Non-fiction] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | From f3404b4b13377431c6d251926f13e338bc9f9b65 Mon Sep 17 00:00:00 2001 From: Mehran Sarmadi <128898167+mehran-sarmadi@users.noreply.github.com> Date: Thu, 30 Jan 2025 12:55:47 +0330 Subject: [PATCH 049/275] Feat: Add FaMTEB (Farsi/Persian Text Embedding Benchmark) (#1843) * Add Summary Retrieval Task * Add FaMTEBClassification * Add FaMTEBClustering * Add FaMTEBPairClassification * Add FaMTEBRetrieval and BEIRFA and FaMTEBSTS * Add FaMTEBSummaryRetrieval * Add FaMTEB to benchmarks * fix benchmark names * temporary fix metadata * Fix dataset revisions * Update SummaryRetrievalEvaluator.py * Update task files * Update task files * add data 
domain and subtask description * Update AbsTaskSummaryRetrieval and FaMTEBSummaryRetrieval * Update AbsTaskSummaryRetrieval * Add mock task * Update AbsTaskSummaryRetrieval * Update AbsTaskSummaryRetrieval * make lint * Refactor SummaryRetrieval to subclass BitextMining * Add aggregated datasets --------- Co-authored-by: mehran Co-authored-by: e.zeinivand Co-authored-by: Erfun76 <59398902+Erfun76@users.noreply.github.com> --- mteb/abstasks/TaskMetadata.py | 1 + mteb/benchmarks/benchmarks.py | 80 +++ mteb/tasks/Classification/__init__.py | 1 + .../fas/FaMTEBClassification.py | 635 +++++++++++++++++ mteb/tasks/Clustering/__init__.py | 1 + mteb/tasks/Clustering/fas/FaMTEBClustering.py | 211 ++++++ mteb/tasks/Clustering/fas/__init__.py | 0 mteb/tasks/PairClassification/__init__.py | 1 + .../fas/FaMTEBPairClassification.py | 282 ++++++++ mteb/tasks/Retrieval/__init__.py | 2 + mteb/tasks/Retrieval/fas/BEIRFa.py | 662 ++++++++++++++++++ mteb/tasks/Retrieval/fas/FaMTEBRetrieval.py | 140 ++++ mteb/tasks/Retrieval/fas/__init__.py | 0 mteb/tasks/STS/__init__.py | 1 + mteb/tasks/STS/fas/FaMTEBSTS.py | 104 +++ mteb/tasks/STS/fas/__init__.py | 0 mteb/tasks/SummaryRetrieval/__init__.py | 3 + .../fas/FaMTEBSummaryRetrieval.py | 97 +++ mteb/tasks/SummaryRetrieval/fas/__init__.py | 0 mteb/tasks/__init__.py | 1 + .../CQADupStackRetrievalFa.py | 46 ++ .../SynPerChatbotConvSAClassification.py | 40 ++ mteb/tasks/aggregated_tasks/__init__.py | 8 +- tests/test_TaskMetadata.py | 2 + 24 files changed, 2317 insertions(+), 1 deletion(-) create mode 100644 mteb/tasks/Classification/fas/FaMTEBClassification.py create mode 100644 mteb/tasks/Clustering/fas/FaMTEBClustering.py create mode 100644 mteb/tasks/Clustering/fas/__init__.py create mode 100644 mteb/tasks/PairClassification/fas/FaMTEBPairClassification.py create mode 100644 mteb/tasks/Retrieval/fas/BEIRFa.py create mode 100644 mteb/tasks/Retrieval/fas/FaMTEBRetrieval.py create mode 100644 mteb/tasks/Retrieval/fas/__init__.py create mode 
100644 mteb/tasks/STS/fas/FaMTEBSTS.py create mode 100644 mteb/tasks/STS/fas/__init__.py create mode 100644 mteb/tasks/SummaryRetrieval/__init__.py create mode 100644 mteb/tasks/SummaryRetrieval/fas/FaMTEBSummaryRetrieval.py create mode 100644 mteb/tasks/SummaryRetrieval/fas/__init__.py create mode 100644 mteb/tasks/aggregated_tasks/CQADupStackRetrievalFa.py create mode 100644 mteb/tasks/aggregated_tasks/SynPerChatbotConvSAClassification.py diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py index f0f8a66143..248c81ac6f 100644 --- a/mteb/abstasks/TaskMetadata.py +++ b/mteb/abstasks/TaskMetadata.py @@ -96,6 +96,7 @@ "Summarization", "InstructionRetrieval", "Speed", + "SummaryRetrieval", ] diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index c1e5236ecf..eddf9d76bb 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -1222,6 +1222,86 @@ def load_results( }""", ) +FA_MTEB = Benchmark( + name="FaMTEB(fas, beta)", + tasks=get_tasks( + languages=["fas"], + tasks=[ + # Classification + "PersianFoodSentimentClassification", + "SynPerChatbotConvSAClassification", + "SynPerChatbotConvSAToneChatbotClassification", + "SynPerChatbotConvSAToneUserClassification", + "SynPerChatbotSatisfactionLevelClassification", + "SynPerChatbotRAGToneChatbotClassification", + "SynPerChatbotRAGToneUserClassification", + "SynPerChatbotToneChatbotClassification", + "SynPerChatbotToneUserClassification", + "PersianTextTone", + "SIDClassification", + "DeepSentiPers", + "PersianTextEmotion", + "SentimentDKSF", + "NLPTwitterAnalysisClassification", + "DigikalamagClassification", + "MassiveIntentClassification", + "MassiveScenarioClassification", + # Clustering + "BeytooteClustering", + "DigikalamagClustering", + "HamshahriClustring", + "NLPTwitterAnalysisClustering", + "SIDClustring", + # PairClassification + "FarsTail", + "CExaPPC", + "SynPerChatbotRAGFAQPC", + "FarsiParaphraseDetection", + "SynPerTextKeywordsPC", + 
"SynPerQAPC", + "ParsinluEntail", + "ParsinluQueryParaphPC", + # Reranking + "MIRACLReranking", + "WikipediaRerankingMultilingual", + # Retrieval + "SynPerQARetrieval", + "SynPerChatbotTopicsRetrieval", + "SynPerChatbotRAGTopicsRetrieval", + "SynPerChatbotRAGFAQRetrieval", + "PersianWebDocumentRetrieval", + "WikipediaRetrievalMultilingual", + "MIRACLRetrieval", + "ClimateFEVER-Fa", + "DBPedia-Fa", + "HotpotQA-Fa", + "MSMARCO-Fa", + "NQ-Fa", + "ArguAna-Fa", + "CQADupstackRetrieval-Fa", + "FiQA2018-Fa", + "NFCorpus-Fa", + "QuoraRetrieval-Fa", + "SCIDOCS-Fa", + "SciFact-Fa", + "TRECCOVID-Fa", + "Touche2020-Fa", + # STS + "Farsick", + "SynPerSTS", + "Query2Query", + # SummaryRetrieval + "SAMSumFa", + "SynPerChatbotSumSRetrieval", + "SynPerChatbotRAGSumSRetrieval", + ], + ), + description="Main Persian (Farsi) benchmarks from MTEB", + reference=None, + citation=None, + contacts=["mehran-sarmadi", "ERfun", "morteza20"], +) + CHEMTEB = Benchmark( name="ChemTEB", tasks=get_tasks( diff --git a/mteb/tasks/Classification/__init__.py b/mteb/tasks/Classification/__init__.py index 12b0623b6b..b2aab22714 100644 --- a/mteb/tasks/Classification/__init__.py +++ b/mteb/tasks/Classification/__init__.py @@ -57,6 +57,7 @@ from .eng.YahooAnswersTopicsClassification import * from .eng.YelpReviewFullClassification import * from .est.estonian_valence import * +from .fas.FaMTEBClassification import * from .fas.PersianFoodSentimentClassification import * from .fil.FilipinoHateSpeechClassification import * from .fil.FilipinoShopeeReviewsClassification import * diff --git a/mteb/tasks/Classification/fas/FaMTEBClassification.py b/mteb/tasks/Classification/fas/FaMTEBClassification.py new file mode 100644 index 0000000000..43c7971429 --- /dev/null +++ b/mteb/tasks/Classification/fas/FaMTEBClassification.py @@ -0,0 +1,635 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class 
SynPerChatbotConvSAAnger(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotConvSAAnger", + description="Synthetic Persian Chatbot Conversational Sentiment Analysis Anger", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-conversational-sentiment-analysis-anger", + "revision": "5cae68b7fc094cb2fa6890a464e4d836e8107f5e", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotConvSASatisfaction(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotConvSASatisfaction", + description="Synthetic Persian Chatbot Conversational Sentiment Analysis Satisfaction", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-conversational-sentiment-analysis-satisfaction", + "revision": "50fd9d5d09edd53af89af765636be5db6f983f0e", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotConvSAFriendship(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotConvSAFriendship", + description="Synthetic Persian Chatbot Conversational Sentiment Analysis Friendship", + reference="https://mcinext.com/", + dataset={ + "path": 
"MCINext/synthetic-persian-chatbot-conversational-sentiment-analysis-friendship", + "revision": "9dae119101e9b4e9bb40d5b9d29ffd7a621f9942", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotConvSAFear(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotConvSAFear", + description="Synthetic Persian Chatbot Conversational Sentiment Analysis Fear", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-conversational-sentiment-analysis-fear", + "revision": "3c22f7e6bf4e366c86d69293c9164bf9e9d80aac", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotConvSAJealousy(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotConvSAJealousy", + description="Synthetic Persian Chatbot Conversational Sentiment Analysis Jealousy", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-conversational-sentiment-analysis-jealousy", + "revision": "0d5104ecaa109d2448afe1f40dbf860f5e4458a8", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + 
domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotConvSASurprise(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotConvSASurprise", + description="Synthetic Persian Chatbot Conversational Sentiment Analysis Surprise", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-conversational-sentiment-analysis-surprise", + "revision": "62dad66fc2837b0ac5e5175fe7c265d2d502a386", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotConvSALove(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotConvSALove", + description="Synthetic Persian Chatbot Conversational Sentiment Analysis Love", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-conversational-sentiment-analysis-love", + "revision": "0e000b2f73e9bb74ec8fc6da10011c52725b8469", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotConvSASadness(AbsTaskClassification): + metadata = 
TaskMetadata( + name="SynPerChatbotConvSASadness", + description="Synthetic Persian Chatbot Conversational Sentiment Analysis Sadness", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-conversational-sentiment-analysis-sadness", + "revision": "e9c678325565a5e4dadc43fd6693a8ccff1dd6b2", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotConvSAHappiness(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotConvSAHappiness", + description="Synthetic Persian Chatbot Conversational Sentiment Analysis Happiness", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-conversational-sentiment-analysis-happiness", + "revision": "e60893b7a8d01c9b8c12fadfe8f0a06e9d548a63", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotConvSAToneChatbotClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotConvSAToneChatbotClassification", + description="Synthetic Persian Chatbot Conversational Sentiment Analysis Tone Chatbot Classification", + reference="https://mcinext.com/", + dataset={ + "path": 
"MCINext/synthetic-persian-chatbot-conversational-sentiment-analysis-tone-chatbot-classification", + "revision": "1f403cfadb85004fbf7e2480334fffc4c999b4ab", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotConvSAToneUserClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotConvSAToneUserClassification", + description="Synthetic Persian Chatbot Conversational Sentiment Analysis Tone User", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/chatbot-conversational-sentiment-analysis-tone-user-classification", + "revision": "dd0f76661bef69819cc38c8a455b10af86ac3571", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotSatisfactionLevelClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotSatisfactionLevelClassification", + description="Synthetic Persian Chatbot Satisfaction Level Classification", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-satisfaction-level-classification", + "revision": "e72db473602d750f1bcdc9f0436e1e3b967e088f", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + 
eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotRAGToneChatbotClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotRAGToneChatbotClassification", + description="Synthetic Persian Chatbot RAG Tone Chatbot Classification", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-rag-tone-chatbot-classification", + "revision": "76f15a203fc13bd98a8f0fdddab1b68c28d7d674", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotRAGToneUserClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotRAGToneUserClassification", + description="Synthetic Persian Chatbot RAG Tone User Classification", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-rag-tone-user-classification", + "revision": "f1f6ad83bb135dc94fbf1ca05c3ba164f5619369", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + 
samples_per_label = 32 + + +class SynPerChatbotToneChatbotClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotToneChatbotClassification", + description="Synthetic Persian Chatbot Tone Chatbot Classification", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-tone-chatbot-classification", + "revision": "a5a739a036fa7bb8ae0be91bc081fdd260d4bdab", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SynPerChatbotToneUserClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="SynPerChatbotToneUserClassification", + description="Synthetic Persian Chatbot Tone User Classification", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-tone-user-classification", + "revision": "780d629437f7be127c6b287a61776372f9f333b9", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class PersianTextTone(AbsTaskClassification): + metadata = TaskMetadata( + name="PersianTextTone", + description="Persian Text Tone", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/persian-text-tone", + "revision": "7144f4c6bdd77911df0dfc5a8bd44dba17e27e3a", + }, + type="Classification", + 
category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=[], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SIDClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="SIDClassification", + description="SID Classification", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/sid-classification", + "revision": "29bed651bb980395f5aa473607154d93226945e1", + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Academic"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class DeepSentiPers(AbsTaskClassification): + metadata = TaskMetadata( + name="DeepSentiPers", + description="Persian Sentiment Analysis Dataset", + reference="https://github.com/JoyeBright/DeepSentiPers", + dataset={ + "path": "PartAI/DeepSentiPers", + "revision": "ee4f09f404051761cfe14d68127532c82de41cb3", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Reviews"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + def dataset_transform(self): + self.dataset = self.dataset.rename_column("review", "text") + + +class PersianTextEmotion(AbsTaskClassification): + metadata = TaskMetadata( + name="PersianTextEmotion", + 
description="Emotion is a Persian dataset with six basic emotions: anger, fear, joy, love, sadness, and surprise.", + reference="https://huggingface.co/datasets/SeyedAli/Persian-Text-Emotion", + dataset={ + "path": "SeyedAli/Persian-Text-Emotion", + "revision": "518fcd2c8b89917c7696770672688217a2eabf88", + }, + type="Classification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=[], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class SentimentDKSF(AbsTaskClassification): + metadata = TaskMetadata( + name="SentimentDKSF", + description="The Sentiment DKSF (Digikala/Snappfood comments) is a dataset for sentiment analysis.", + reference="https://github.com/hezarai/hezar", + dataset={ + "path": "hezarai/sentiment-dksf", + "revision": "b4d5a8dd501db610b5ad89e9aa13f863b842b395", + }, + type="Classification", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Reviews"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + +class NLPTwitterAnalysisClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="NLPTwitterAnalysisClassification", + description="Twitter Analysis Classification", + reference="https://huggingface.co/datasets/hamedhf/nlp_twitter_analysis/tree/main", + dataset={ + "path": "hamedhf/nlp_twitter_analysis", + "revision": "4ceb1312583fd2c7c73ad2d550b726124dcd39a0", + }, + type="Classification", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + 
main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Social"], + task_subtypes=["Sentiment/Hate speech"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + def dataset_transform(self): + self.dataset = self.dataset.rename_column("tweet", "text") + + +class DigikalamagClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="DigikalamagClassification", + description="A total of 8,515 articles scraped from Digikala Online Magazine. This dataset includes seven different classes.", + reference="https://hooshvare.github.io/docs/datasets/tc", + dataset={ + "path": "PNLPhub/DigiMag", + "revision": "969b335c9f50eda5c384460be4eb2b55505c2c53", + "trust_remote_code": True, + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + def dataset_transform(self): + self.dataset = self.dataset.rename_column("content", "text") diff --git a/mteb/tasks/Clustering/__init__.py b/mteb/tasks/Clustering/__init__.py index de27839290..65d8b01246 100644 --- a/mteb/tasks/Clustering/__init__.py +++ b/mteb/tasks/Clustering/__init__.py @@ -20,6 +20,7 @@ from .eng.WikiCitiesClustering import * from .eng.WikipediaChemistrySpecialtiesClustering import * from .eng.WikipediaChemistryTopicsClustering import * +from .fas.FaMTEBClustering import * from .fra.AlloProfClusteringP2P import * from .fra.AlloProfClusteringS2S import * from .fra.HALClusteringS2S import * diff --git a/mteb/tasks/Clustering/fas/FaMTEBClustering.py b/mteb/tasks/Clustering/fas/FaMTEBClustering.py new file mode 100644 index 0000000000..da0b8b53f3 --- /dev/null +++ 
b/mteb/tasks/Clustering/fas/FaMTEBClustering.py @@ -0,0 +1,211 @@ +from __future__ import annotations + +import numpy as np +from datasets import Dataset, DatasetDict + +from mteb.abstasks.AbsTaskClusteringFast import ( + AbsTaskClusteringFast, + check_label_distribution, +) +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class BeytooteClustering(AbsTaskClusteringFast): + metadata = TaskMetadata( + name="BeytooteClustering", + description="Beytoote Website Articles Clustering", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/beytoote-clustering", + "revision": "62ca5aecb9414214162569f2f1bfb07aa219a70e", + }, + type="Clustering", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="v_measure", + date=("2024-09-01", "2024-12-31"), + domains=["News"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, + seed=self.seed, + splits=["test"], + label="labels", + ) + + +class DigikalamagClustering(AbsTaskClusteringFast): + metadata = TaskMetadata( + name="DigikalamagClustering", + description="A total of 8,515 articles scraped from Digikala Online Magazine. 
This dataset includes seven different classes.", + reference="https://hooshvare.github.io/docs/datasets/tc", + dataset={ + "path": "PNLPhub/DigiMag", + "revision": "969b335c9f50eda5c384460be4eb2b55505c2c53", + "trust_remote_code": True, + }, + type="Clustering", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="v_measure", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + self.dataset = self.dataset.rename_columns( + {"label": "labels", "content": "sentences"} + ) + + self.dataset = self.stratified_subsampling( + self.dataset, + seed=self.seed, + splits=["test"], + label="labels", + ) + + +class HamshahriClustring(AbsTaskClusteringFast): + metadata = TaskMetadata( + name="HamshahriClustring", + description="These datasets have been extracted from the RSS feed of two Farsi news agency websites.", + reference="https://github.com/mallahyari/Farsi-datasets", + dataset={ + "path": "community-datasets/farsi_news", + "revision": "ca93dc707cea06cdb2e3ede3b547a1092053aca6", + }, + type="Clustering", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="v_measure", + date=("2024-09-01", "2024-12-31"), + domains=["News"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + self.dataset = self.dataset.map( + lambda x: {"sentences": f"{x['title']}\n: {x['summary']}"} + ) + self.dataset = self.dataset.map(lambda x: {"labels": x["tags"][0]}) + self.dataset = DatasetDict({"test": self.dataset["hamshahri"]}) + + ds = {} + for split in self.metadata.eval_splits: + labels = self.dataset[split]["labels"] + sentences = self.dataset[split]["sentences"] + + 
check_label_distribution(self.dataset[split]) + + # Remove sentences and labels with only 1 label example. + unique_labels, counts = np.unique(labels, return_counts=True) + solo_label_idx = np.where(counts == 1) + solo_labels = unique_labels[solo_label_idx] + is_solo = np.isin(labels, solo_labels) + split_ds = Dataset.from_dict({"labels": labels, "sentences": sentences}) + if is_solo.any(): + split_ds = split_ds.select(np.nonzero(is_solo == False)[0]) # noqa: E712 + ds[split] = split_ds + self.dataset = DatasetDict(ds) + + self.dataset = self.stratified_subsampling( + self.dataset, + seed=self.seed, + splits=["test"], + label="labels", + ) + + +class NLPTwitterAnalysisClustering(AbsTaskClusteringFast): + metadata = TaskMetadata( + name="NLPTwitterAnalysisClustering", + description="Clustering of tweets from twitter across 26 categories.", + reference="https://huggingface.co/datasets/hamedhf/nlp_twitter_analysis/commits/main", + dataset={ + "path": "hamedhf/nlp_twitter_analysis", + "revision": "4ceb1312583fd2c7c73ad2d550b726124dcd39a0", + }, + type="Clustering", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="v_measure", + date=("2024-09-01", "2024-12-31"), + domains=["Social"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + self.dataset = self.dataset.rename_column("tweet", "sentences") + self.dataset = self.dataset.rename_column("label", "labels") + self.dataset = self.stratified_subsampling( + self.dataset, + seed=self.seed, + splits=["test"], + label="labels", + ) + + +class SIDClustring(AbsTaskClusteringFast): + metadata = TaskMetadata( + name="SIDClustring", + description="Clustering of summariesfrom SIDClustring across categories.", + reference="https://www.sid.com/", + dataset={ + "path": "MCINext/sid-clustering", + "revision": 
"d361bb18535d592e845aeaaa8ac47a239aa2f87a", + }, + type="Clustering", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="v_measure", + date=("2024-09-01", "2024-12-31"), + domains=["Academic"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, + seed=self.seed, + splits=["test"], + label="labels", + ) diff --git a/mteb/tasks/Clustering/fas/__init__.py b/mteb/tasks/Clustering/fas/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/tasks/PairClassification/__init__.py b/mteb/tasks/PairClassification/__init__.py index e229195df0..6cd75ea144 100644 --- a/mteb/tasks/PairClassification/__init__.py +++ b/mteb/tasks/PairClassification/__init__.py @@ -11,6 +11,7 @@ from .eng.SprintDuplicateQuestionsPC import * from .eng.TwitterSemEval2015PC import * from .eng.TwitterURLCorpusPC import * +from .fas.FaMTEBPairClassification import * from .fas.FarsTail import * from .hye.ArmenianParaphrasePC import * from .ind.IndoNLI import * diff --git a/mteb/tasks/PairClassification/fas/FaMTEBPairClassification.py b/mteb/tasks/PairClassification/fas/FaMTEBPairClassification.py new file mode 100644 index 0000000000..6deba76d8d --- /dev/null +++ b/mteb/tasks/PairClassification/fas/FaMTEBPairClassification.py @@ -0,0 +1,282 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class CExaPPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="CExaPPC", + description="ExaPPC is a large paraphrase corpus consisting of monolingual sentence-level paraphrases using different sources.", + reference="https://github.com/exaco/exappc", + dataset={ + "path": "PNLPhub/C-ExaPPC", + "revision": 
"68a0ff474739367a36c8066ee04802a65aefc117", + }, + type="PairClassification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="max_ap", + date=("2024-09-01", "2024-12-31"), + domains=["Social", "Web"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + _dataset = {} + self.dataset = self.dataset.map( + lambda example: {"label": 1 if example["label"] == "paraphrase" else 0} + ) + for split in self.metadata.eval_splits: + _dataset[split] = [ + { + "sentence1": self.dataset[split]["sentence1"], + "sentence2": self.dataset[split]["sentence2"], + "labels": self.dataset[split]["label"], + } + ] + self.dataset = _dataset + + +class SynPerChatbotRAGFAQPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="SynPerChatbotRAGFAQPC", + description="Synthetic Persian Chatbot RAG FAQ Pair Classification", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-rag-faq-pair-classification", + "revision": "2128d809e27ab8528906e2231f8e824516fb8e5a", + }, + type="PairClassification", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="max_ap", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + _dataset = {} + for split in self.metadata.eval_splits: + _dataset[split] = [ + { + "sentence1": self.dataset[split]["sent1"][0], + "sentence2": self.dataset[split]["sent2"][0], + "labels": self.dataset[split]["labels"][0], + } + ] + self.dataset = _dataset + + +class FarsiParaphraseDetection(AbsTaskPairClassification): + metadata = TaskMetadata( + name="FarsiParaphraseDetection", + 
description="Farsi Paraphrase Detection", + reference="https://huggingface.co/datasets/alighasemi/farsi_paraphrase_detection", + dataset={ + "path": "alighasemi/farsi_paraphrase_detection", + "revision": "c8129741af418d9ae43cfc1fc4f285704e26035f", + }, + type="PairClassification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="max_ap", + date=("2024-09-01", "2024-12-31"), + domains=[], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + _dataset = {} + for split in self.metadata.eval_splits: + _dataset[split] = [ + { + "sentence1": self.dataset[split]["sentence1"], + "sentence2": self.dataset[split]["sentence2"], + "labels": self.dataset[split]["label"], + } + ] + self.dataset = _dataset + + +class SynPerTextKeywordsPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="SynPerTextKeywordsPC", + description="Synthetic Persian Text Keywords Pair Classification", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-text-keyword-pair-classification", + "revision": "ea9a840cb163b415cc70b2f7adf2554feae159dc", + }, + type="PairClassification", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="max_ap", + date=("2024-09-01", "2024-12-31"), + domains=["Web", "News", "Religious", "Blog"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + _dataset = {} + for split in self.metadata.eval_splits: + _dataset[split] = [ + { + "sentence1": self.dataset[split]["sent1"][0], + "sentence2": self.dataset[split]["sent2"][0], + "labels": self.dataset[split]["labels"][0], + } + ] + self.dataset = _dataset + + +class 
SynPerQAPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="SynPerQAPC", + description="Synthetic Persian QA Pair Classification", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-qa-pair-classification", + "revision": "d1b62ef31bebbb48ae01867993a1e583c2ce7d93", + }, + type="PairClassification", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="max_ap", + date=("2024-09-01", "2024-12-31"), + domains=["Web", "News", "Religious", "Blog"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + _dataset = {} + for split in self.metadata.eval_splits: + _dataset[split] = [ + { + "sentence1": self.dataset[split]["sent1"][0], + "sentence2": self.dataset[split]["sent2"][0], + "labels": self.dataset[split]["labels"][0], + } + ] + self.dataset = _dataset + + +class ParsinluEntail(AbsTaskPairClassification): + metadata = TaskMetadata( + name="ParsinluEntail", + description="A Persian textual entailment task (deciding sent1 entails sent2). 
The questions are partially translated from the SNLI dataset and partially generated by expert annotators.", + reference="https://github.com/persiannlp/parsinlu", + dataset={ + "path": "persiannlp/parsinlu_entailment", + "revision": "c49b2d8fa0d6476520695c52207690b7ec854043", + "trust_remote_code": True, + }, + type="PairClassification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="max_ap", + date=("2024-09-01", "2024-12-31"), + domains=[], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + _dataset = {} + self.dataset = self.dataset.filter(lambda x: x["label"] != "n") + self.dataset = self.dataset.map( + lambda example: {"label": 1 if example["label"] == "e" else 0} + ) + for split in self.metadata.eval_splits: + _dataset[split] = [ + { + "sentence1": self.dataset[split]["sent1"], + "sentence2": self.dataset[split]["sent2"], + "labels": self.dataset[split]["label"], + } + ] + self.dataset = _dataset + + +class ParsinluQueryParaphPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="ParsinluQueryParaphPC", + description="A Persian query paraphrasng task (deciding whether two questions are paraphrases of each other). 
The questions are partially generated from Google auto-complete, and partially translated from the Quora paraphrasing dataset.", + reference="https://huggingface.co/datasets/persiannlp/parsinlu_query_paraphrasing", + dataset={ + "path": "persiannlp/parsinlu_query_paraphrasing", + "revision": "ec675bb3ac50c1a52317c101fe1d724b4601f47a", + "trust_remote_code": True, + }, + type="PairClassification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="max_ap", + date=("2024-09-01", "2024-12-31"), + domains=[], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + _dataset = {} + self.dataset = self.dataset.map( + lambda example: {"label": 1 if example["label"] == "1" else 0} + ) + for split in self.metadata.eval_splits: + _dataset[split] = [ + { + "sentence1": self.dataset[split]["q1"], + "sentence2": self.dataset[split]["q2"], + "labels": self.dataset[split]["label"], + } + ] + self.dataset = _dataset diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index 6c146379ea..291dd983c3 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -103,6 +103,8 @@ from .eng.TRECCOVIDRetrieval import * from .eng.WinoGrandeRetrieval import * from .est.estqa import * +from .fas.BEIRFa import * +from .fas.FaMTEBRetrieval import * from .fra.AlloprofRetrieval import * from .fra.BSARDRetrieval import * from .fra.FQuADRetrieval import * diff --git a/mteb/tasks/Retrieval/fas/BEIRFa.py b/mteb/tasks/Retrieval/fas/BEIRFa.py new file mode 100644 index 0000000000..0952eefff9 --- /dev/null +++ b/mteb/tasks/Retrieval/fas/BEIRFa.py @@ -0,0 +1,662 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class ArguAnaFa(AbsTaskRetrieval): + 
ignore_identical_ids = True + metadata = TaskMetadata( + name="ArguAna-Fa", + description="ArguAna-Fa", + reference="https://huggingface.co/datasets/MCINext/arguana-fa", + dataset={ + "path": "MCINext/arguana-fa", + "revision": "fa97814be356fe4d18caadb457b4469bd34019ca", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Blog"], + task_subtypes=["Article retrieval"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class ClimateFEVERFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="ClimateFEVER-Fa", + description="ClimateFEVER-Fa", + reference="https://huggingface.co/datasets/MCINext/climate-fever-fa", + dataset={ + "path": "MCINext/climate-fever-fa", + "revision": "45d9176b6fcba33abc58494ee82f18ee7e8ddbae", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Article retrieval"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackAndroidRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackAndroidRetrieval-Fa", + description="CQADupstackAndroidRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-android-fa", + dataset={ + "path": "MCINext/cqadupstack-android-fa", + "revision": "bcdaf4e30477eea9b9b17ecbb153ca403e5c3ebd", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + 
dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackEnglishRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackEnglishRetrieval-Fa", + description="CQADupstackEnglishRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-english-fa", + dataset={ + "path": "MCINext/cqadupstack-english-fa", + "revision": "029a2e69e7d9e68b6bdc471073606104417a5be7", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackGamingRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackGamingRetrieval-Fa", + description="CQADupstackGamingRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-gaming-fa", + dataset={ + "path": "MCINext/cqadupstack-gaming-fa", + "revision": "e9c7ad03f29a55ab14eae730146961b8cdc14227", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackGisRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackGisRetrieval-Fa", + description="CQADupstackGisRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-gis-fa", + dataset={ + "path": "MCINext/cqadupstack-gis-fa", + "revision": "e907c4144dc27bc8a035d78d69e15f39c56623a9", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + 
eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackMathematicaRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackMathematicaRetrieval-Fa", + description="CQADupstackMathematicaRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-mathematica-fa", + dataset={ + "path": "MCINext/cqadupstack-mathematica-fa", + "revision": "b92e24fab42ab599535a19ee744de5485ec92f64", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackPhysicsRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackPhysicsRetrieval-Fa", + description="CQADupstackPhysicsRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa", + dataset={ + "path": "MCINext/cqadupstack-physics-fa", + "revision": "816ad7473d6813f77a0ca5e72b1ff7a52752d370", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackProgrammersRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackProgrammersRetrieval-Fa", + description="CQADupstackProgrammersRetrieval-Fa", + 
reference="https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa", + dataset={ + "path": "MCINext/cqadupstack-programmers-fa", + "revision": "be6460df57ab7c1b2c9fe295a31660dbd077ecf0", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackStatsRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackStatsRetrieval-Fa", + description="CQADupstackStatsRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa", + dataset={ + "path": "MCINext/cqadupstack-stats-fa", + "revision": "c6e2c8b6153958118ec04352dd82a30ea2e2cad5", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackTexRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackTexRetrieval-Fa", + description="CQADupstackTexRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-tex-fa", + dataset={ + "path": "MCINext/cqadupstack-tex-fa", + "revision": "860d152c86fda27229270b6bf4e832ff374ac01b", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + 
bibtex_citation=""" """, + ) + + +class CQADupstackUnixRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackUnixRetrieval-Fa", + description="CQADupstackUnixRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-unix-fa", + dataset={ + "path": "MCINext/cqadupstack-unix-fa", + "revision": "c2a326387954aad66ff00d324a9278067b1e3bb6", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackWebmastersRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackWebmastersRetrieval-Fa", + description="CQADupstackWebmastersRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-webmasters-fa", + dataset={ + "path": "MCINext/cqadupstack-webmasters-fa", + "revision": "506f29f8ce59648efe99afee736b0b158eced516", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackWordpressRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackWordpressRetrieval-Fa", + description="CQADupstackWordpressRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-wordpress-fa", + dataset={ + "path": "MCINext/cqadupstack-wordpress-fa", + "revision": "7f755e88647b4023df52da04d4e3d31a7de5fcb0", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + 
eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class DBPediaFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="DBPedia-Fa", + description="DBPedia-Fa", + reference="https://huggingface.co/datasets/MCINext/dbpedia-fa", + dataset={ + "path": "MCINext/dbpedia-fa", + "revision": "13529e6e301e9d72f86def882cfbca04791d83f9", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Encyclopaedic"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class FiQA2018Fa(AbsTaskRetrieval): + ignore_identical_ids = True + + metadata = TaskMetadata( + name="FiQA2018-Fa", + description="FiQA2018-Fa", + reference="https://huggingface.co/datasets/MCINext/fiqa-fa", + dataset={ + "path": "MCINext/fiqa-fa", + "revision": "e683ce7ecd0b47edc3e29fda7cfd75335be4a997", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Article retrieval"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class HotpotQAFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="HotpotQA-Fa", + description="HotpotQA-Fa", + reference="https://huggingface.co/datasets/MCINext/hotpotqa-fa", + dataset={ + "path": "MCINext/hotpotqa-fa", + "revision": "1cafde1306aa56b5dfdce0b14633ae9ee1a63ddb", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + 
eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Encyclopaedic"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class MSMARCOFa(AbsTaskRetrieval): + ignore_identical_ids = True + + metadata = TaskMetadata( + name="MSMARCO-Fa", + description="MSMARCO-Fa", + reference="https://huggingface.co/datasets/MCINext/msmarco-fa", + dataset={ + "path": "MCINext/msmarco-fa", + "revision": "88f90b0b04f91778ba5341095b0a9f1d7799ce10", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["dev"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class NFCorpusFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NFCorpus-Fa", + description="NFCorpus-Fa", + reference="https://huggingface.co/datasets/MCINext/nfcorpus-fa", + dataset={ + "path": "MCINext/nfcorpus-fa", + "revision": "70aa71825a791e87210c0355a01f538aa611feae", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Medical"], + task_subtypes=["Article retrieval"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class NQFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NQ-Fa", + description="NQ-Fa", + reference="https://huggingface.co/datasets/MCINext/nq-fa", + dataset={ + "path": "MCINext/nq-fa", + "revision": "d4ea898b644c8d5f608b60947cb637bebbf1ac66", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], 
+ main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Encyclopaedic"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class QuoraRetrievalFa(AbsTaskRetrieval): + ignore_identical_ids = True + + metadata = TaskMetadata( + name="QuoraRetrieval-Fa", + description="QuoraRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/quora-fa", + dataset={ + "path": "MCINext/quora-fa", + "revision": "1a43f4f5dcd71e6b14b275ae82c3237cdd4fd5fd", + }, + type="Retrieval", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class SCIDOCSFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="SCIDOCS-Fa", + description="SCIDOCS-Fa", + reference="https://huggingface.co/datasets/MCINext/scidocs-fa", + dataset={ + "path": "MCINext/scidocs-fa", + "revision": "6611ebf4b4c1aaf8b021e4728440db2188291b8b", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Academic"], + task_subtypes=["Article retrieval"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class SciFactFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="SciFact-Fa", + description="SciFact-Fa", + reference="https://huggingface.co/datasets/MCINext/scifact-fa", + dataset={ + "path": "MCINext/scifact-fa", + "revision": "7723397096199c4d6f367b445fccaf282c518abe", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + 
eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Academic"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class TRECCOVIDFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="TRECCOVID-Fa", + description="TRECCOVID-Fa", + reference="https://huggingface.co/datasets/MCINext/trec-covid-fa", + dataset={ + "path": "MCINext/trec-covid-fa", + "revision": "98e6c2d33dfa166ee326e8b1bc7ea82c7e6898dd", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Medical"], + task_subtypes=["Article retrieval"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class Touche2020Fa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="Touche2020-Fa", + description="Touche2020-Fa", + reference="https://huggingface.co/datasets/MCINext/touche2020-fa", + dataset={ + "path": "MCINext/touche2020-fa", + "revision": "0f464636f91641cc6ef6f6f8f249c73f4a609982", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) diff --git a/mteb/tasks/Retrieval/fas/FaMTEBRetrieval.py b/mteb/tasks/Retrieval/fas/FaMTEBRetrieval.py new file mode 100644 index 0000000000..875f7ea7db --- /dev/null +++ b/mteb/tasks/Retrieval/fas/FaMTEBRetrieval.py @@ -0,0 +1,140 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class 
SynPerQARetrieval(AbsTaskRetrieval): + ignore_identical_ids = True + metadata = TaskMetadata( + name="SynPerQARetrieval", + description="Synthetic Persian QA Retrieval", + reference="https://huggingface.co/datasets/MCINext/synthetic-persian-qa-retrieval/settings", + dataset={ + "path": "MCINext/synthetic-persian-qa-retrieval", + "revision": "e85114f13f42dc1edc456d58931cc38d44d697cf", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation="""""", + ) + + +class SynPerChatbotTopicsRetrieval(AbsTaskRetrieval): + ignore_identical_ids = True + metadata = TaskMetadata( + name="SynPerChatbotTopicsRetrieval", + description="Synthetic Persian Chatbot Topics Retrieval", + reference="https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-topics-retrieval", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-topics-retrieval", + "revision": "086995ca4cea33f37a407c2fa5282f74913740ee", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation="""""", + ) + + +class SynPerChatbotRAGTopicsRetrieval(AbsTaskRetrieval): + ignore_identical_ids = True + metadata = TaskMetadata( + name="SynPerChatbotRAGTopicsRetrieval", + description="Synthetic Persian Chatbot RAG Topics Retrieval", + reference="https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-rag-topics-retrieval", + dataset={ + "path": 
"MCINext/synthetic-persian-chatbot-rag-topics-retrieval", + "revision": "da8f36a723da155738f5e3d8d84d543589bd5083", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation="""""", + ) + + +class SynPerChatbotRAGFAQRetrieval(AbsTaskRetrieval): + ignore_identical_ids = True + metadata = TaskMetadata( + name="SynPerChatbotRAGFAQRetrieval", + description="Synthetic Persian Chatbot RAG FAQ Retrieval", + reference="https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-rag-faq-retrieval", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-rag-faq-retrieval", + "revision": "9d32af6540970e2845028cbfffe6b0d0e8f52428", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation="""""", + ) + + +class PersianWebDocumentRetrieval(AbsTaskRetrieval): + ignore_identical_ids = True + metadata = TaskMetadata( + name="PersianWebDocumentRetrieval", + description="Persian dataset designed specifically for the task of text information retrieval through the web.", + reference="https://ieeexplore.ieee.org/document/10553090", + dataset={ + "path": "MCINext/persian-web-document-retrieval", + "revision": "b3dc818368a867b30ccb55a42ff287d253512c36", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + 
task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation="""""", + ) diff --git a/mteb/tasks/Retrieval/fas/__init__.py b/mteb/tasks/Retrieval/fas/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/tasks/STS/__init__.py b/mteb/tasks/STS/__init__.py index b61b79b293..471789f1c9 100644 --- a/mteb/tasks/STS/__init__.py +++ b/mteb/tasks/STS/__init__.py @@ -10,6 +10,7 @@ from .eng.STS16STS import * from .eng.STSBenchmarkSTS import * from .fao.FaroeseSTS import * +from .fas.FaMTEBSTS import * from .fin.FinParaSTS import * from .fra.SickFrSTS import * from .jpn.JSICK import * diff --git a/mteb/tasks/STS/fas/FaMTEBSTS.py b/mteb/tasks/STS/fas/FaMTEBSTS.py new file mode 100644 index 0000000000..2ce9522cd4 --- /dev/null +++ b/mteb/tasks/STS/fas/FaMTEBSTS.py @@ -0,0 +1,104 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskSTS import AbsTaskSTS + + +class Farsick(AbsTaskSTS): + metadata = TaskMetadata( + name="Farsick", + description="A Persian Semantic Textual Similarity And Natural Language Inference Dataset", + reference="https://github.com/ZahraGhasemi-AI/FarSick", + dataset={ + "path": "MCINext/farsick-sts", + "revision": "f8b8d630f631c6c16b7bc3cb924bdf62a51bed06", + }, + type="STS", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="cosine_spearman", + date=("2024-09-01", "2024-12-31"), + domains=[], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + @property + def metadata_dict(self) -> dict[str, str]: + metadata_dict = super().metadata_dict + metadata_dict["min_score"] = 1 + metadata_dict["max_score"] = 5 + return metadata_dict + + +class SynPerSTS(AbsTaskSTS): + metadata = TaskMetadata( + name="SynPerSTS", + description="Synthetic 
Persian Semantic Textual Similarity Dataset", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-sts", + "revision": "914047db08928b5326d8b106583dc563b73d1ecf", + }, + type="STS", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="cosine_spearman", + date=("2024-09-01", "2024-12-31"), + domains=["Web", "News", "Religious", "Blog"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + + @property + def metadata_dict(self) -> dict[str, str]: + metadata_dict = super().metadata_dict + metadata_dict["min_score"] = 1 + metadata_dict["max_score"] = 5 + return metadata_dict + + +class Query2Query(AbsTaskSTS): + metadata = TaskMetadata( + name="Query2Query", + description="Query to Query Datasets.", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/query-to-query-sts", + "revision": "52602079f9032fcf181775a310d79d2f197534e4", + }, + type="STS", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="cosine_spearman", + date=("2024-09-01", "2024-12-31"), + domains=[], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + @property + def metadata_dict(self) -> dict[str, str]: + metadata_dict = super().metadata_dict + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 2 + return metadata_dict diff --git a/mteb/tasks/STS/fas/__init__.py b/mteb/tasks/STS/fas/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/tasks/SummaryRetrieval/__init__.py b/mteb/tasks/SummaryRetrieval/__init__.py new file mode 100644 index 0000000000..d000983be9 --- /dev/null +++ b/mteb/tasks/SummaryRetrieval/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from 
.fas.FaMTEBSummaryRetrieval import * diff --git a/mteb/tasks/SummaryRetrieval/fas/FaMTEBSummaryRetrieval.py b/mteb/tasks/SummaryRetrieval/fas/FaMTEBSummaryRetrieval.py new file mode 100644 index 0000000000..cf3f9dbe52 --- /dev/null +++ b/mteb/tasks/SummaryRetrieval/fas/FaMTEBSummaryRetrieval.py @@ -0,0 +1,97 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class SAMSumFa(AbsTaskBitextMining): + metadata = TaskMetadata( + name="SAMSumFa", + description="Translated Version of SAMSum Dataset", + reference="https://huggingface.co/datasets/MCINext/samsum-fa", + dataset={ + "path": "MCINext/samsum-fa", + "revision": "fd981d78a0ab82c20d2e693a8b3929c5d71b0743", + }, + type="SummaryRetrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="f1", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + self.dataset = self.dataset.rename_columns( + {"text": "sentence1", "summary": "sentence2"} + ) + + +class SynPerChatbotSumSRetrieval(AbsTaskBitextMining): + metadata = TaskMetadata( + name="SynPerChatbotSumSRetrieval", + description="Synthetic Persian Chatbot Summary Dataset", + reference="https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-summary-retrieval", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-summary-retrieval", + "revision": "9002f5e9de4ef61f1f5c34831d2a5ed855bac0ae", + }, + type="SummaryRetrieval", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="f1", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + 
sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + self.dataset = self.dataset.rename_columns( + {"text": "sentence1", "summary": "sentence2"} + ) + + +class SynPerChatbotRAGSumSRetrieval(AbsTaskBitextMining): + metadata = TaskMetadata( + name="SynPerChatbotRAGSumSRetrieval", + description="Synthetic Persian Chatbot RAG Summary Dataset", + reference="https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-rag-summary-retrieval", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-rag-summary-retrieval", + "revision": "f77746f286bbf2177ee7b5a803da8be440d5d4c1", + }, + type="SummaryRetrieval", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="f1", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + self.dataset = self.dataset.rename_columns( + {"text": "sentence1", "summary": "sentence2"} + ) diff --git a/mteb/tasks/SummaryRetrieval/fas/__init__.py b/mteb/tasks/SummaryRetrieval/fas/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/tasks/__init__.py b/mteb/tasks/__init__.py index 8a53eb0d7d..8877da707c 100644 --- a/mteb/tasks/__init__.py +++ b/mteb/tasks/__init__.py @@ -12,3 +12,4 @@ from .SpeedTask import * from .STS import * from .Summarization import * +from .SummaryRetrieval import * diff --git a/mteb/tasks/aggregated_tasks/CQADupStackRetrievalFa.py b/mteb/tasks/aggregated_tasks/CQADupStackRetrievalFa.py new file mode 100644 index 0000000000..6a60f4b000 --- /dev/null +++ b/mteb/tasks/aggregated_tasks/CQADupStackRetrievalFa.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from mteb.abstasks import AbsTask +from mteb.abstasks.aggregated_task import AbsTaskAggregate, 
AggregateTaskMetadata +from mteb.tasks.Retrieval import ( + CQADupstackAndroidRetrievalFa, + CQADupstackEnglishRetrievalFa, + CQADupstackGamingRetrievalFa, + CQADupstackGisRetrievalFa, + CQADupstackMathematicaRetrievalFa, + CQADupstackPhysicsRetrievalFa, + CQADupstackProgrammersRetrievalFa, + CQADupstackStatsRetrievalFa, + CQADupstackTexRetrievalFa, + CQADupstackUnixRetrievalFa, + CQADupstackWebmastersRetrievalFa, + CQADupstackWordpressRetrievalFa, +) + +task_list_cqa: list[AbsTask] = [ + CQADupstackAndroidRetrievalFa(), + CQADupstackEnglishRetrievalFa(), + CQADupstackGamingRetrievalFa(), + CQADupstackGisRetrievalFa(), + CQADupstackMathematicaRetrievalFa(), + CQADupstackPhysicsRetrievalFa(), + CQADupstackProgrammersRetrievalFa(), + CQADupstackStatsRetrievalFa(), + CQADupstackTexRetrievalFa(), + CQADupstackUnixRetrievalFa(), + CQADupstackWebmastersRetrievalFa(), + CQADupstackWordpressRetrievalFa(), +] + + +class CQADupstackRetrievalFa(AbsTaskAggregate): + metadata = AggregateTaskMetadata( + name="CQADupstackRetrieval-Fa", + description="CQADupstackRetrieval-Fa", + reference="", + tasks=task_list_cqa, + main_score="ndcg_at_10", + type="Retrieval", # since everything is retrieval - otherwise it would be "Aggregated" + eval_splits=["test"], + bibtex_citation=""" """, + ) diff --git a/mteb/tasks/aggregated_tasks/SynPerChatbotConvSAClassification.py b/mteb/tasks/aggregated_tasks/SynPerChatbotConvSAClassification.py new file mode 100644 index 0000000000..46c6ed9600 --- /dev/null +++ b/mteb/tasks/aggregated_tasks/SynPerChatbotConvSAClassification.py @@ -0,0 +1,40 @@ +from __future__ import annotations + +from mteb.abstasks import AbsTask +from mteb.abstasks.aggregated_task import AbsTaskAggregate, AggregateTaskMetadata +from mteb.tasks.Classification import ( + SynPerChatbotConvSAAnger, + SynPerChatbotConvSAFear, + SynPerChatbotConvSAFriendship, + SynPerChatbotConvSAHappiness, + SynPerChatbotConvSAJealousy, + SynPerChatbotConvSALove, + SynPerChatbotConvSASadness, + 
SynPerChatbotConvSASatisfaction, + SynPerChatbotConvSASurprise, +) + +task_list_cqa: list[AbsTask] = [ + SynPerChatbotConvSAAnger(), + SynPerChatbotConvSASatisfaction(), + SynPerChatbotConvSAFriendship(), + SynPerChatbotConvSAFear(), + SynPerChatbotConvSAJealousy(), + SynPerChatbotConvSASurprise(), + SynPerChatbotConvSALove(), + SynPerChatbotConvSASadness(), + SynPerChatbotConvSAHappiness(), +] + + +class SynPerChatbotConvSAClassification(AbsTaskAggregate): + metadata = AggregateTaskMetadata( + name="SynPerChatbotConvSAClassification", + description="SynPerChatbotConvSAClassification", + reference="", + tasks=task_list_cqa, + main_score="accuracy", + type="Classification", + eval_splits=["test"], + bibtex_citation=""" """, + ) diff --git a/mteb/tasks/aggregated_tasks/__init__.py b/mteb/tasks/aggregated_tasks/__init__.py index 15025208cc..5333db7916 100644 --- a/mteb/tasks/aggregated_tasks/__init__.py +++ b/mteb/tasks/aggregated_tasks/__init__.py @@ -1,5 +1,11 @@ from __future__ import annotations from .CQADupStackRetrieval import CQADupstackRetrieval +from .CQADupStackRetrievalFa import CQADupstackRetrievalFa +from .SynPerChatbotConvSAClassification import SynPerChatbotConvSAClassification -__all__ = ["CQADupstackRetrieval"] +__all__ = [ + "CQADupstackRetrieval", + "CQADupstackRetrievalFa", + "SynPerChatbotConvSAClassification", +] diff --git a/tests/test_TaskMetadata.py b/tests/test_TaskMetadata.py index 09df66628a..f7ac92a697 100644 --- a/tests/test_TaskMetadata.py +++ b/tests/test_TaskMetadata.py @@ -179,6 +179,8 @@ "TamilNewsClassification", "TenKGnadClusteringP2P.v2", "TenKGnadClusteringS2S.v2", + "SynPerChatbotConvSAClassification", + "CQADupstackRetrieval-Fa", ] From 0a59704cb3711f01be8f211cee005033b7aadb20 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 09:27:50 +0000 Subject: [PATCH 050/275] Update tasks table --- docs/tasks.md | 2183 +++++++++++++++++++++++++------------------------ 1 file changed, 1128 insertions(+), 1055 
deletions(-) diff --git a/docs/tasks.md b/docs/tasks.md index 1ad19a3490..cee6247578 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -30,6 +30,7 @@ The following tables give you an overview of the tasks in MTEB. | [ArXivHierarchicalClusteringP2P](https://www.kaggle.com/Cornell-University/arxiv) | ['eng'] | Clustering | p2p | [Academic, Written] | {'test': 2048} | {'test': {'num_samples': 2048, 'number_of_characters': 2065284, 'min_text_length': 103, 'average_text_length': 1008.44, 'max_text_length': 2103, 'min_labels_per_text': 1, 'average_labels_per_text': 1.46, 'max_labels_per_text': 381, 'unique_labels': 129, 'labels': {'cs': {'count': 356}, 'math': {'count': 381}, 'OC': {'count': 11}, 'hep-lat': {'count': 13}, 'hep': {'count': 98}, 'astro-ph': {'count': 213}, 'eess': {'count': 76}, 'quant-ph': {'count': 135}, 'DC': {'count': 5}, 'cond-mat': {'count': 274}, 'hep-th': {'count': 66}, 'SP': {'count': 33}, 'hep-ph': {'count': 69}, 'FA': {'count': 6}, 'nucl-th': {'count': 17}, 'q-bio': {'count': 80}, 'HE': {'count': 22}, 'HC': {'count': 2}, 'stat': {'count': 60}, 'ML': {'count': 16}, 'IV': {'count': 13}, 'stat-mech': {'count': 47}, 'DS': {'count': 14}, 'ME': {'count': 12}, 'CC': {'count': 2}, 'mtrl-sci': {'count': 22}, 'PE': {'count': 16}, 'NT': {'count': 11}, 'SC': {'count': 6}, 'AG': {'count': 13}, 'physics': {'count': 81}, 'ins-det': {'count': 9}, 'GA': {'count': 18}, 'BM': {'count': 6}, 'GN': {'count': 17}, 'NA': {'count': 15}, 'app-ph': {'count': 7}, 'RT': {'count': 6}, 'other': {'count': 37}, 'soft': {'count': 15}, 'CO': {'count': 33}, 'supr-con': {'count': 21}, 'chem-ph': {'count': 3}, 'DM': {'count': 2}, 'MN': {'count': 12}, 'q-fin': {'count': 27}, 'PM': {'count': 2}, 'AP': {'count': 27}, 'gr-qc': {'count': 15}, 'quant-gas': {'count': 8}, 'mes-hall': {'count': 33}, 'IT': {'count': 19}, 'SI': {'count': 6}, 'SG': {'count': 3}, 'bio-ph': {'count': 2}, 'SR': {'count': 16}, 'soc-ph': {'count': 5}, 'hep-ex': {'count': 15}, 'DG': {'count': 11}, 'NE': {'count': 5}, 
'CR': {'count': 6}, 'CL': {'count': 12}, 'RM': {'count': 3}, 'econ': {'count': 17}, 'nlin': {'count': 5}, 'PS': {'count': 1}, 'LG': {'count': 26}, 'QA': {'count': 9}, 'str-el': {'count': 26}, 'CV': {'count': 34}, 'MF': {'count': 6}, 'IM': {'count': 7}, 'EM': {'count': 6}, 'TH': {'count': 5}, 'PR': {'count': 20}, 'AT': {'count': 4}, 'OA': {'count': 4}, 'CP': {'count': 6}, 'LO': {'count': 14}, 'flu-dyn': {'count': 6}, 'atom-ph': {'count': 8}, 'class-ph': {'count': 1}, 'SY': {'count': 20}, 'IR': {'count': 1}, 'plasm-ph': {'count': 8}, 'CE': {'count': 2}, 'AO': {'count': 1}, 'comp-ph': {'count': 3}, 'optics': {'count': 12}, 'MG': {'count': 4}, 'ST': {'count': 6}, 'nucl-ex': {'count': 6}, 'CY': {'count': 9}, 'ao-ph': {'count': 2}, 'DB': {'count': 1}, 'math-ph': {'count': 10}, 'NC': {'count': 13}, 'GT': {'count': 11}, 'TO': {'count': 2}, 'AI': {'count': 9}, 'NI': {'count': 2}, 'gen-ph': {'count': 4}, 'OT': {'count': 4}, 'SD': {'count': 2}, 'dis-nn': {'count': 4}, 'RO': {'count': 7}, 'CA': {'count': 6}, 'FL': {'count': 1}, 'SE': {'count': 5}, 'EP': {'count': 9}, 'hist-ph': {'count': 1}, 'QM': {'count': 9}, 'ed-ph': {'count': 2}, 'GR': {'count': 4}, 'MS': {'count': 1}, 'CD': {'count': 1}, 'ET': {'count': 1}, 'acc-ph': {'count': 5}, 'AC': {'count': 2}, 'OH': {'count': 1}, 'EC': {'count': 2}, 'DL': {'count': 1}, 'AS': {'count': 3}, 'geo-ph': {'count': 2}, 'CG': {'count': 3}, 'CB': {'count': 1}, 'AR': {'count': 1}, 'TR': {'count': 1}, 'atm-clus': {'count': 1}}}} | | [ArXivHierarchicalClusteringS2S](https://www.kaggle.com/Cornell-University/arxiv) | ['eng'] | Clustering | p2p | [Academic, Written] | None | None | | [ArguAna](http://argumentation.bplaced.net/arguana/data) (Boteva et al., 2016) | ['eng'] | Retrieval | s2p | [Medical, Written] | None | None | +| [ArguAna-Fa](https://huggingface.co/datasets/MCINext/arguana-fa) | ['fas'] | Retrieval | s2p | [Blog] | None | None | | [ArguAna-PL](https://huggingface.co/datasets/clarin-knext/arguana-pl) (Konrad Wojtasik, 2024) | 
['pol'] | Retrieval | s2p | | None | None | | [ArmenianParaphrasePC](https://github.com/ivannikov-lab/arpa-paraphrase-corpus) (Arthur Malajyan, 2020) | ['hye'] | PairClassification | s2s | [News, Written] | None | None | | [ArxivClassification](https://ieeexplore.ieee.org/document/8675939) (He et al., 2019) | ['eng'] | Classification | s2s | [Academic, Written] | None | None | @@ -46,6 +47,7 @@ The following tables give you an overview of the tasks in MTEB. | [BengaliDocumentClassification](https://aclanthology.org/2023.eacl-main.4) | ['ben'] | Classification | s2s | [News, Written] | None | None | | [BengaliHateSpeechClassification](https://huggingface.co/datasets/bn_hate_speech) (Karim et al., 2020) | ['ben'] | Classification | s2s | [News, Written] | None | None | | [BengaliSentimentAnalysis](https://data.mendeley.com/datasets/p6zc7krs37/4) (Sazzed et al., 2020) | ['ben'] | Classification | s2s | [Reviews, Written] | None | None | +| [BeytooteClustering](https://mcinext.com/) | ['fas'] | Clustering | p2p | [News] | None | None | | [BibleNLPBitextMining](https://arxiv.org/abs/2304.09919) (Akerman et al., 2023) | ['aai', 'aak', 'aau', 'aaz', 'abt', 'abx', 'aby', 'acf', 'acr', 'acu', 'adz', 'aer', 'aey', 'agd', 'agg', 'agm', 'agn', 'agr', 'agt', 'agu', 'aia', 'aii', 'aka', 'ake', 'alp', 'alq', 'als', 'aly', 'ame', 'amf', 'amk', 'amm', 'amn', 'amo', 'amp', 'amr', 'amu', 'amx', 'anh', 'anv', 'aoi', 'aoj', 'aom', 'aon', 'apb', 'ape', 'apn', 'apr', 'apu', 'apw', 'apz', 'arb', 'are', 'arl', 'arn', 'arp', 'asm', 'aso', 'ata', 'atb', 'atd', 'atg', 'att', 'auc', 'aui', 'auy', 'avt', 'awb', 'awk', 'awx', 'azb', 'azg', 'azz', 'bao', 'bba', 'bbb', 'bbr', 'bch', 'bco', 'bdd', 'bea', 'bef', 'bel', 'ben', 'beo', 'beu', 'bgs', 'bgt', 'bhg', 'bhl', 'big', 'bjk', 'bjp', 'bjr', 'bjv', 'bjz', 'bkd', 'bki', 'bkq', 'bkx', 'blw', 'blz', 'bmh', 'bmk', 'bmr', 'bmu', 'bnp', 'boa', 'boj', 'bon', 'box', 'bpr', 'bps', 'bqc', 'bqp', 'bre', 'bsj', 'bsn', 'bsp', 'bss', 'buk', 'bus', 'bvd', 
'bvr', 'bxh', 'byr', 'byx', 'bzd', 'bzh', 'bzj', 'caa', 'cab', 'cac', 'caf', 'cak', 'cao', 'cap', 'car', 'cav', 'cax', 'cbc', 'cbi', 'cbk', 'cbr', 'cbs', 'cbt', 'cbu', 'cbv', 'cco', 'ceb', 'cek', 'ces', 'cgc', 'cha', 'chd', 'chf', 'chk', 'chq', 'chz', 'cjo', 'cjv', 'ckb', 'cle', 'clu', 'cme', 'cmn', 'cni', 'cnl', 'cnt', 'cof', 'con', 'cop', 'cot', 'cpa', 'cpb', 'cpc', 'cpu', 'cpy', 'crn', 'crx', 'cso', 'csy', 'cta', 'cth', 'ctp', 'ctu', 'cub', 'cuc', 'cui', 'cuk', 'cut', 'cux', 'cwe', 'cya', 'daa', 'dad', 'dah', 'dan', 'ded', 'deu', 'dgc', 'dgr', 'dgz', 'dhg', 'dif', 'dik', 'dji', 'djk', 'djr', 'dob', 'dop', 'dov', 'dwr', 'dww', 'dwy', 'ebk', 'eko', 'emi', 'emp', 'eng', 'enq', 'epo', 'eri', 'ese', 'esk', 'etr', 'ewe', 'faa', 'fai', 'far', 'ffm', 'for', 'fra', 'fue', 'fuf', 'fuh', 'gah', 'gai', 'gam', 'gaw', 'gdn', 'gdr', 'geb', 'gfk', 'ghs', 'glk', 'gmv', 'gng', 'gnn', 'gnw', 'gof', 'grc', 'gub', 'guh', 'gui', 'guj', 'gul', 'gum', 'gun', 'guo', 'gup', 'gux', 'gvc', 'gvf', 'gvn', 'gvs', 'gwi', 'gym', 'gyr', 'hat', 'hau', 'haw', 'hbo', 'hch', 'heb', 'heg', 'hin', 'hix', 'hla', 'hlt', 'hmo', 'hns', 'hop', 'hot', 'hrv', 'hto', 'hub', 'hui', 'hun', 'hus', 'huu', 'huv', 'hvn', 'ian', 'ign', 'ikk', 'ikw', 'ilo', 'imo', 'inb', 'ind', 'ino', 'iou', 'ipi', 'isn', 'ita', 'iws', 'ixl', 'jac', 'jae', 'jao', 'jic', 'jid', 'jiv', 'jni', 'jpn', 'jvn', 'kan', 'kaq', 'kbc', 'kbh', 'kbm', 'kbq', 'kdc', 'kde', 'kdl', 'kek', 'ken', 'kew', 'kgf', 'kgk', 'kgp', 'khs', 'khz', 'kik', 'kiw', 'kiz', 'kje', 'kjs', 'kkc', 'kkl', 'klt', 'klv', 'kmg', 'kmh', 'kmk', 'kmo', 'kms', 'kmu', 'kne', 'knf', 'knj', 'knv', 'kos', 'kpf', 'kpg', 'kpj', 'kpr', 'kpw', 'kpx', 'kqa', 'kqc', 'kqf', 'kql', 'kqw', 'ksd', 'ksj', 'ksr', 'ktm', 'kto', 'kud', 'kue', 'kup', 'kvg', 'kvn', 'kwd', 'kwf', 'kwi', 'kwj', 'kyc', 'kyf', 'kyg', 'kyq', 'kyz', 'kze', 'lac', 'lat', 'lbb', 'lbk', 'lcm', 'leu', 'lex', 'lgl', 'lid', 'lif', 'lin', 'lit', 'llg', 'lug', 'luo', 'lww', 'maa', 'maj', 'mal', 'mam', 'maq', 'mar', 'mau', 
'mav', 'maz', 'mbb', 'mbc', 'mbh', 'mbj', 'mbl', 'mbs', 'mbt', 'mca', 'mcb', 'mcd', 'mcf', 'mco', 'mcp', 'mcq', 'mcr', 'mdy', 'med', 'mee', 'mek', 'meq', 'met', 'meu', 'mgc', 'mgh', 'mgw', 'mhl', 'mib', 'mic', 'mie', 'mig', 'mih', 'mil', 'mio', 'mir', 'mit', 'miz', 'mjc', 'mkj', 'mkl', 'mkn', 'mks', 'mle', 'mlh', 'mlp', 'mmo', 'mmx', 'mna', 'mop', 'mox', 'mph', 'mpj', 'mpm', 'mpp', 'mps', 'mpt', 'mpx', 'mqb', 'mqj', 'msb', 'msc', 'msk', 'msm', 'msy', 'mti', 'mto', 'mux', 'muy', 'mva', 'mvn', 'mwc', 'mwe', 'mwf', 'mwp', 'mxb', 'mxp', 'mxq', 'mxt', 'mya', 'myk', 'myu', 'myw', 'myy', 'mzz', 'nab', 'naf', 'nak', 'nas', 'nbq', 'nca', 'nch', 'ncj', 'ncl', 'ncu', 'ndg', 'ndj', 'nfa', 'ngp', 'ngu', 'nhe', 'nhg', 'nhi', 'nho', 'nhr', 'nhu', 'nhw', 'nhy', 'nif', 'nii', 'nin', 'nko', 'nld', 'nlg', 'nna', 'nnq', 'noa', 'nop', 'not', 'nou', 'npi', 'npl', 'nsn', 'nss', 'ntj', 'ntp', 'ntu', 'nuy', 'nvm', 'nwi', 'nya', 'nys', 'nyu', 'obo', 'okv', 'omw', 'ong', 'ons', 'ood', 'opm', 'ory', 'ote', 'otm', 'otn', 'otq', 'ots', 'pab', 'pad', 'pah', 'pan', 'pao', 'pes', 'pib', 'pio', 'pir', 'piu', 'pjt', 'pls', 'plu', 'pma', 'poe', 'poh', 'poi', 'pol', 'pon', 'por', 'poy', 'ppo', 'prf', 'pri', 'ptp', 'ptu', 'pwg', 'qub', 'quc', 'quf', 'quh', 'qul', 'qup', 'qvc', 'qve', 'qvh', 'qvm', 'qvn', 'qvs', 'qvw', 'qvz', 'qwh', 'qxh', 'qxn', 'qxo', 'rai', 'reg', 'rgu', 'rkb', 'rmc', 'rmy', 'ron', 'roo', 'rop', 'row', 'rro', 'ruf', 'rug', 'rus', 'rwo', 'sab', 'san', 'sbe', 'sbk', 'sbs', 'seh', 'sey', 'sgb', 'sgz', 'shj', 'shp', 'sim', 'sja', 'sll', 'smk', 'snc', 'snn', 'snp', 'snx', 'sny', 'som', 'soq', 'soy', 'spa', 'spl', 'spm', 'spp', 'sps', 'spy', 'sri', 'srm', 'srn', 'srp', 'srq', 'ssd', 'ssg', 'ssx', 'stp', 'sua', 'sue', 'sus', 'suz', 'swe', 'swh', 'swp', 'sxb', 'tac', 'taj', 'tam', 'tav', 'taw', 'tbc', 'tbf', 'tbg', 'tbo', 'tbz', 'tca', 'tcs', 'tcz', 'tdt', 'tee', 'tel', 'ter', 'tet', 'tew', 'tfr', 'tgk', 'tgl', 'tgo', 'tgp', 'tha', 'tif', 'tim', 'tiw', 'tiy', 'tke', 'tku', 'tlf', 'tmd', 
'tna', 'tnc', 'tnk', 'tnn', 'tnp', 'toc', 'tod', 'tof', 'toj', 'ton', 'too', 'top', 'tos', 'tpa', 'tpi', 'tpt', 'tpz', 'trc', 'tsw', 'ttc', 'tte', 'tuc', 'tue', 'tuf', 'tuo', 'tur', 'tvk', 'twi', 'txq', 'txu', 'tzj', 'tzo', 'ubr', 'ubu', 'udu', 'uig', 'ukr', 'uli', 'ulk', 'upv', 'ura', 'urb', 'urd', 'uri', 'urt', 'urw', 'usa', 'usp', 'uvh', 'uvl', 'vid', 'vie', 'viv', 'vmy', 'waj', 'wal', 'wap', 'wat', 'wbi', 'wbp', 'wed', 'wer', 'wim', 'wiu', 'wiv', 'wmt', 'wmw', 'wnc', 'wnu', 'wol', 'wos', 'wrk', 'wro', 'wrs', 'wsk', 'wuv', 'xav', 'xbi', 'xed', 'xla', 'xnn', 'xon', 'xsi', 'xtd', 'xtm', 'yaa', 'yad', 'yal', 'yap', 'yaq', 'yby', 'ycn', 'yka', 'yle', 'yml', 'yon', 'yor', 'yrb', 'yre', 'yss', 'yuj', 'yut', 'yuw', 'yva', 'zaa', 'zab', 'zac', 'zad', 'zai', 'zaj', 'zam', 'zao', 'zap', 'zar', 'zas', 'zat', 'zav', 'zaw', 'zca', 'zga', 'zia', 'ziw', 'zlm', 'zos', 'zpc', 'zpl', 'zpm', 'zpo', 'zpq', 'zpu', 'zpv', 'zpz', 'zsr', 'ztq', 'zty', 'zyp'] | BitextMining | s2s | [Religious, Written] | None | None | | [BigPatentClustering.v2](https://huggingface.co/datasets/NortheasternUniversity/big_patent) (Eva Sharma and Chen Li and Lu Wang, 2019) | ['eng'] | Clustering | p2p | [Legal, Written] | None | None | | [BiorxivClusteringP2P.v2](https://api.biorxiv.org/) | ['eng'] | Clustering | p2p | [Academic, Written] | None | None | @@ -60,6 +62,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CDSC-E](https://aclanthology.org/P17-1073.pdf) | ['pol'] | PairClassification | s2s | [Written] | None | None | | [CDSC-R](https://aclanthology.org/P17-1073.pdf) | ['pol'] | STS | s2s | [Web, Written] | None | None | | [CEDRClassification](https://www.sciencedirect.com/science/article/pii/S1877050921013247) (Sboev et al., 2021) | ['rus'] | MultilabelClassification | s2s | [Web, Social, Blog, Written] | {'test': 1882, 'train': 7528} | {'test': {'num_samples': 1882, 'number_of_characters': 171649, 'number_texts_in_train': 7, 'min_text_length': 6, 'average_text_length': 91.21, 'max_text_length': 220, 'unique_texts': 1875, 'min_labels_per_text': 0, 'average_label_per_text': 0.62, 'max_labels_per_text': 2, 'unique_labels': 6, 'labels': {'None': {'count': 734}, '3': {'count': 141}, '2': {'count': 170}, '1': {'count': 379}, '0': {'count': 353}, '4': {'count': 125}}}, 'train': {'num_samples': 7528, 'number_of_characters': 697322, 'number_texts_in_train': None, 'min_text_length': 5, 'average_text_length': 92.63, 'max_text_length': 280, 'unique_texts': 7500, 'min_labels_per_text': 0, 'average_label_per_text': 0.61, 'max_labels_per_text': 3, 'unique_labels': 6, 'labels': {'None': {'count': 3043}, '2': {'count': 607}, '0': {'count': 1569}, '3': {'count': 589}, '1': {'count': 1417}, '4': {'count': 411}}}} | +| [CExaPPC](https://github.com/exaco/exappc) | ['fas'] | PairClassification | s2s | [Social, Web] | None | None | | [CLSClusteringP2P.v2](https://arxiv.org/abs/2209.05034) (Yudong Li, 2022) | ['cmn'] | Clustering | p2p | [Academic, Written] | None | None | | [CLSClusteringS2S.v2](https://arxiv.org/abs/2209.05034) (Yudong Li, 2022) | ['cmn'] | Clustering | s2s | [Academic, Written] | None | None | | [CMedQAv1-reranking](https://github.com/zhangsheng93/cMedQA) (Zhang et al., 2017) | ['cmn'] | Reranking | s2s | [Medical, Written] | None | None | @@ -67,18 +70,31 @@ The following tables give you an overview of the tasks in MTEB. 
| [COIRCodeSearchNetRetrieval](https://huggingface.co/datasets/code_search_net/) (Husain et al., 2019) | ['go', 'java', 'javascript', 'php', 'python', 'ruby'] | Retrieval | p2p | [Programming, Written] | {'test': 1056326} | {'test': {'number_of_characters': 36843313, 'num_samples': 1056326, 'num_queries': 52561, 'num_documents': 1003765, 'min_document_length': 54, 'average_document_length': 34.71, 'max_document_length': 334374, 'unique_documents': 1003765, 'min_query_length': 2, 'average_query_length': 38.19, 'max_query_length': 2, 'unique_queries': 52561, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 52561, 'hf_subset_descriptive_stats': {'python': {'number_of_characters': 14574651, 'num_samples': 295228, 'num_queries': 14918, 'num_documents': 280310, 'min_document_length': 95, 'average_document_length': 49.99, 'max_document_length': 14008, 'unique_documents': 280310, 'min_query_length': 2, 'average_query_length': 37.58, 'max_query_length': 2, 'unique_queries': 14918, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 14918}, 'javascript': {'number_of_characters': 2587540, 'num_samples': 68145, 'num_queries': 3291, 'num_documents': 64854, 'min_document_length': 87, 'average_document_length': 37.9, 'max_document_length': 334374, 'unique_documents': 64854, 'min_query_length': 2, 'average_query_length': 39.41, 'max_query_length': 2, 'unique_queries': 3291, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3291}, 'go': {'number_of_characters': 3641108, 'num_samples': 190562, 'num_queries': 8122, 'num_documents': 182440, 'min_document_length': 54, 'average_document_length': 17.96, 'max_document_length': 5280, 'unique_documents': 182440, 'min_query_length': 2, 'average_query_length': 44.92, 'max_query_length': 2, 
'unique_queries': 8122, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 8122}, 'ruby': {'number_of_characters': 629446, 'num_samples': 28831, 'num_queries': 1261, 'num_documents': 27570, 'min_document_length': 83, 'average_document_length': 20.83, 'max_document_length': 3992, 'unique_documents': 27570, 'min_query_length': 2, 'average_query_length': 43.73, 'max_query_length': 2, 'unique_queries': 1261, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1261}, 'java': {'number_of_characters': 6791137, 'num_samples': 191821, 'num_queries': 10955, 'num_documents': 180866, 'min_document_length': 77, 'average_document_length': 35.55, 'max_document_length': 7615, 'unique_documents': 180866, 'min_query_length': 2, 'average_query_length': 33.02, 'max_query_length': 2, 'unique_queries': 10955, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10955}, 'php': {'number_of_characters': 8619431, 'num_samples': 281739, 'num_queries': 14014, 'num_documents': 267725, 'min_document_length': 94, 'average_document_length': 30.2, 'max_document_length': 4904, 'unique_documents': 267725, 'min_query_length': 2, 'average_query_length': 38.21, 'max_query_length': 2, 'unique_queries': 14014, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 14014}}}} | | [CPUSpeedTask](https://github.com/KennethEnevoldsen/scandinavian-embedding-benchmark/blob/c8376f967d1294419be1d3eb41217d04cd3a65d3/src/seb/registered_tasks/speed.py#L83-L96) | ['eng'] | Speed | s2s | [Fiction, Written] | None | None | | [CQADupstackAndroidRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | +| 
[CQADupstackAndroidRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-android-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackEnglishRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | +| [CQADupstackEnglishRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-english-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackGamingRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | +| [CQADupstackGamingRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-gaming-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackGisRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | +| [CQADupstackGisRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-gis-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | +| [CQADupstackMathematicaRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-mathematica-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | +| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | 
['eng'] | Retrieval | None | [Programming, Written, Non-fiction] | None | None | +| [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Programming] | None | None | +| CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | +| [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | +| [CQADupstackTexRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-tex-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | +| [CQADupstackUnixRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-unix-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackWebmastersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | +| [CQADupstackWebmastersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-webmasters-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackWordpressRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | +| [CQADupstackWordpressRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-wordpress-fa) | ['fas'] 
| Retrieval | s2p | [Web] | None | None | | [CSFDCZMovieReviewSentimentClassification](https://arxiv.org/abs/2304.01922) (Michal Štefánik, 2023) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | | [CSFDSKMovieReviewSentimentClassification](https://arxiv.org/abs/2304.01922) (Michal Štefánik, 2023) | ['slk'] | Classification | s2s | [Reviews, Written] | None | None | | [CTKFactsNLI](https://arxiv.org/abs/2201.11115) (Ullrich et al., 2023) | ['ces'] | PairClassification | s2s | [News, Written] | None | None | @@ -126,6 +142,7 @@ The following tables give you an overview of the tasks in MTEB. | [ChemHotpotQARetrieval](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Retrieval | s2p | [Chemistry] | None | None | | [ChemNQRetrieval](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Retrieval | s2p | [Chemistry] | None | None | | [ClimateFEVER](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | | None | None | +| [ClimateFEVER-Fa](https://huggingface.co/datasets/MCINext/climate-fever-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [ClimateFEVERHardNegatives](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | | None | None | | [CmedqaRetrieval](https://aclanthology.org/2022.emnlp-main.357.pdf) | ['cmn'] | Retrieval | s2p | [Medical, Written] | None | None | | [Cmnli](https://huggingface.co/datasets/clue/viewer/cmnli) | ['cmn'] | PairClassification | s2s | | None | None | @@ -161,6 +178,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CzechSoMeSentimentClassification](https://aclanthology.org/W13-1609/) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | | [CzechSubjectivityClassification](https://arxiv.org/abs/2009.08712) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | | [DBPedia](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['eng'] | Retrieval | s2p | [Written, Encyclopaedic] | None | None | +| [DBPedia-Fa](https://huggingface.co/datasets/MCINext/dbpedia-fa) | ['fas'] | Retrieval | s2p | [Encyclopaedic] | None | None | | [DBPedia-PL](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['pol'] | Retrieval | s2p | [Written, Encyclopaedic] | None | None | | [DBPedia-PLHardNegatives](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['pol'] | Retrieval | s2p | [Written, Encyclopaedic] | None | None | | [DBPediaHardNegatives](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['eng'] | Retrieval | s2p | [Written, Encyclopaedic] | None | None | @@ -169,8 +187,11 @@ The following tables give you an overview of the tasks in MTEB. 
| [DalajClassification](https://spraakbanken.gu.se/en/resources/superlim) | ['swe'] | Classification | s2s | [Non-fiction, Written] | None | None | | [DanFeverRetrieval](https://aclanthology.org/2021.nodalida-main.47/) | ['dan'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Spoken] | None | None | | [DanishPoliticalCommentsClassification](https://huggingface.co/datasets/danish_political_comments) (Mads Guldborg Kjeldgaard Kongsbak, 2019) | ['dan'] | Classification | s2s | [Social, Written] | None | None | +| [DeepSentiPers](https://github.com/JoyeBright/DeepSentiPers) | ['fas'] | Classification | s2s | [Reviews] | None | None | | [DefinitionClassificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [DiaBlaBitextMining](https://inria.hal.science/hal-03021633) (González et al., 2019) | ['eng', 'fra'] | BitextMining | s2s | [Social, Written] | None | None | +| [DigikalamagClassification](https://hooshvare.github.io/docs/datasets/tc) | ['fas'] | Classification | p2p | [Web] | None | None | +| [DigikalamagClustering](https://hooshvare.github.io/docs/datasets/tc) | ['fas'] | Clustering | p2p | [Web] | None | None | | [Diversity1LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [Diversity2LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [Diversity3LegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | @@ -192,9 +213,12 @@ The following tables give you an overview of the tasks in MTEB. 
| [FalseFriendsGermanEnglish](https://drive.google.com/file/d/1jgq0nBnV-UiYNxbKNrrr2gxDEHm-DMKH/view?usp=share_link) | ['deu'] | PairClassification | s2s | [Written] | None | None | | [FaroeseSTS](https://aclanthology.org/2023.nodalida-1.74.pdf) | ['fao'] | STS | s2s | [News, Web, Written] | None | None | | [FarsTail](https://link.springer.com/article/10.1007/s00500-023-08959-3) (Amirkhani et al., 2023) | ['fas'] | PairClassification | s2s | [Academic, Written] | None | None | +| [FarsiParaphraseDetection](https://huggingface.co/datasets/alighasemi/farsi_paraphrase_detection) | ['fas'] | PairClassification | s2s | | None | None | +| [Farsick](https://github.com/ZahraGhasemi-AI/FarSick) | ['fas'] | STS | s2s | | None | None | | [FeedbackQARetrieval](https://arxiv.org/abs/2204.03025) | ['eng'] | Retrieval | s2p | [Web, Government, Medical, Written] | None | None | | [FiQA-PL](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['pol'] | Retrieval | s2p | | None | None | | [FiQA2018](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['eng'] | Retrieval | s2p | | None | None | +| [FiQA2018-Fa](https://huggingface.co/datasets/MCINext/fiqa-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [FilipinoHateSpeechClassification](https://pcj.csp.org.ph/index.php/pcj/issue/download/29/PCJ%20V14%20N1%20pp1-14%202019) (Neil Vicente Cabasag et al., 2019) | ['fil'] | Classification | s2s | [Social, Written] | None | None | | [FilipinoShopeeReviewsClassification](https://uijrt.com/articles/v4/i8/UIJRTV4I80009.pdf) | ['fil'] | Classification | s2s | [Social, Written] | None | None | | [FinParaSTS](https://huggingface.co/datasets/TurkuNLP/turku_paraphrase_corpus) | ['fin'] | STS | s2s | [News, Subtitles, Written] | None | None | @@ -222,6 +246,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [GujaratiNewsClassification](https://github.com/goru001/nlp-for-gujarati) | ['guj'] | Classification | s2s | [News, Written] | None | None | | [HALClusteringS2S.v2](https://huggingface.co/datasets/lyon-nlp/clustering-hal-s2s) (Mathieu Ciancone, 2024) | ['fra'] | Clustering | s2s | [Academic, Written] | None | None | | [HagridRetrieval](https://github.com/project-miracl/hagrid) (Ehsan Kamalloo, 2023) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [HamshahriClustring](https://github.com/mallahyari/Farsi-datasets) | ['fas'] | Clustering | p2p | [News] | None | None | | [HateSpeechPortugueseClassification](https://aclanthology.org/W19-3510) | ['por'] | Classification | s2s | [Social, Written] | None | None | | [HeadlineClassification](https://aclanthology.org/2020.ngt-1.6/) | ['rus'] | Classification | s2s | [News, Written] | None | None | | [HebrewSentimentAnalysis](https://huggingface.co/datasets/hebrew_sentiment) | ['heb'] | Classification | s2s | [Reviews, Written] | None | None | @@ -230,6 +255,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [HindiDiscourseClassification](https://aclanthology.org/2020.lrec-1.149/) | ['hin'] | Classification | s2s | [Fiction, Social, Written] | None | None | | [HotelReviewSentimentClassification](https://link.springer.com/chapter/10.1007/978-3-319-67056-0_3) (Elnagar et al., 2018) | ['ara'] | Classification | s2s | [Reviews, Written] | None | None | | [HotpotQA](https://hotpotqa.github.io/) | ['eng'] | Retrieval | s2p | [Web, Written] | None | None | +| [HotpotQA-Fa](https://huggingface.co/datasets/MCINext/hotpotqa-fa) | ['fas'] | Retrieval | s2p | [Encyclopaedic] | None | None | | [HotpotQA-PL](https://hotpotqa.github.io/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Web, Written] | None | None | | [HotpotQA-PLHardNegatives](https://hotpotqa.github.io/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Web, Written] | None | None | | [HotpotQAHardNegatives](https://hotpotqa.github.io/) | ['eng'] | Retrieval | s2p | [Web, Written] | None | None | @@ -320,6 +346,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [MMarcoReranking](https://github.com/unicamp-dl/mMARCO) (Luiz Henrique Bonifacio, 2021) | ['cmn'] | Reranking | s2s | | None | None | | [MMarcoRetrieval](https://arxiv.org/abs/2309.07597) (Shitao Xiao, 2024) | ['cmn'] | Retrieval | s2p | | None | None | | [MSMARCO](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | | None | None | +| [MSMARCO-Fa](https://huggingface.co/datasets/MCINext/msmarco-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [MSMARCO-PL](https://microsoft.github.io/msmarco/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Web, Written] | None | None | | [MSMARCO-PLHardNegatives](https://microsoft.github.io/msmarco/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Web, Written] | None | None | | [MSMARCOHardNegatives](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | | None | None | @@ -352,11 +379,15 @@ The following tables give you an overview of the tasks in MTEB. | [MultilingualSentimentClassification](https://huggingface.co/datasets/mteb/multilingual-sentiment-classification) | ['ara', 'bam', 'bul', 'cmn', 'cym', 'deu', 'dza', 'ell', 'eng', 'eus', 'fas', 'fin', 'heb', 'hrv', 'ind', 'jpn', 'kor', 'mlt', 'nor', 'pol', 'rus', 'slk', 'spa', 'tha', 'tur', 'uig', 'urd', 'vie', 'zho'] | Classification | s2s | [Reviews, Written] | None | None | | [MyanmarNews](https://huggingface.co/datasets/myanmar_news) (A. H. 
Khine, 2017) | ['mya'] | Classification | p2p | [News, Written] | None | None | | [NFCorpus](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Boteva et al., 2016) | ['eng'] | Retrieval | s2p | [Medical, Academic, Written] | {'test': 3956} | {'test': {'number_of_characters': 1612.55, 'num_samples': 3956, 'num_queries': 323, 'num_documents': 3633, 'average_document_length': 0.44, 'average_query_length': 0.07, 'average_relevant_docs_per_query': 38.19}} | +| [NFCorpus-Fa](https://huggingface.co/datasets/MCINext/nfcorpus-fa) | ['fas'] | Retrieval | s2p | [Medical] | None | None | | [NFCorpus-PL](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | | [NLPJournalAbsIntroRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | | [NLPJournalTitleAbsRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | | [NLPJournalTitleIntroRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | +| [NLPTwitterAnalysisClassification](https://huggingface.co/datasets/hamedhf/nlp_twitter_analysis/tree/main) | ['fas'] | Classification | s2p | [Social] | None | None | +| [NLPTwitterAnalysisClustering](https://huggingface.co/datasets/hamedhf/nlp_twitter_analysis/commits/main) | ['fas'] | Clustering | s2s | [Social] | None | None | | [NQ](https://ai.google.com/research/NaturalQuestions/) (Tom Kwiatkowski, 2019) | ['eng'] | Retrieval | s2p | | None | None | +| [NQ-Fa](https://huggingface.co/datasets/MCINext/nq-fa) | ['fas'] | Retrieval | s2p | [Encyclopaedic] | None | None | | [NQ-PL](https://ai.google.com/research/NaturalQuestions/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | | [NQ-PLHardNegatives](https://ai.google.com/research/NaturalQuestions/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None 
| | [NQHardNegatives](https://ai.google.com/research/NaturalQuestions/) (Tom Kwiatkowski, 2019) | ['eng'] | Retrieval | s2p | | None | None | @@ -417,9 +448,14 @@ The following tables give you an overview of the tasks in MTEB. | [PIQA](https://arxiv.org/abs/1911.11641) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [PROALegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [PSC](http://www.lrec-conf.org/proceedings/lrec2014/pdf/1211_Paper.pdf) | ['pol'] | PairClassification | s2s | [News, Written] | None | None | +| [ParsinluEntail](https://github.com/persiannlp/parsinlu) | ['fas'] | PairClassification | s2s | | None | None | +| [ParsinluQueryParaphPC](https://huggingface.co/datasets/persiannlp/parsinlu_query_paraphrasing) | ['fas'] | PairClassification | s2s | | None | None | | [PatentClassification](https://aclanthology.org/P19-1212.pdf) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [PawsXPairClassification](https://arxiv.org/abs/1908.11828) (Yinfei Yang, 2019) | ['cmn', 'deu', 'eng', 'fra', 'jpn', 'kor', 'spa'] | PairClassification | s2s | [Web, Encyclopaedic, Written] | {'test': 14000, 'validation': 14000} | {'test': {'num_samples': 14000, 'number_of_characters': 2551922, 'min_sentence1_length': 2, 'avg_sentence1_length': 91.18, 'max_sentence1_length': 268, 'unique_sentence1': 13404, 'min_sentence2_length': 2, 'avg_sentence2_length': 91.1, 'max_sentence2_length': 247, 'unique_sentence2': 13462, 'unique_labels': 2, 'labels': {'1': {'count': 6285}, '0': {'count': 7715}}, 'hf_subset_descriptive_stats': {'de': {'num_samples': 2000, 'number_of_characters': 478034, 'min_sentence1_length': 2, 'avg_sentence1_length': 119.78, 'max_sentence1_length': 268, 'unique_sentence1': 1934, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.24, 'max_sentence2_length': 235, 'unique_sentence2': 1938, 
'unique_labels': 2, 'labels': {'1': {'count': 895}, '0': {'count': 1105}}}, 'en': {'num_samples': 2000, 'number_of_characters': 454362, 'min_sentence1_length': 25, 'avg_sentence1_length': 113.76, 'max_sentence1_length': 209, 'unique_sentence1': 1761, 'min_sentence2_length': 25, 'avg_sentence2_length': 113.42, 'max_sentence2_length': 209, 'unique_sentence2': 1800, 'unique_labels': 2, 'labels': {'1': {'count': 907}, '0': {'count': 1093}}}, 'es': {'num_samples': 2000, 'number_of_characters': 471226, 'min_sentence1_length': 2, 'avg_sentence1_length': 117.81, 'max_sentence1_length': 226, 'unique_sentence1': 1955, 'min_sentence2_length': 22, 'avg_sentence2_length': 117.8, 'max_sentence2_length': 233, 'unique_sentence2': 1959, 'unique_labels': 2, 'labels': {'1': {'count': 907}, '0': {'count': 1093}}}, 'fr': {'num_samples': 2000, 'number_of_characters': 480033, 'min_sentence1_length': 2, 'avg_sentence1_length': 120.03, 'max_sentence1_length': 238, 'unique_sentence1': 1954, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.99, 'max_sentence2_length': 247, 'unique_sentence2': 1953, 'unique_labels': 2, 'labels': {'1': {'count': 903}, '0': {'count': 1097}}}, 'ja': {'num_samples': 2000, 'number_of_characters': 235106, 'min_sentence1_length': 2, 'avg_sentence1_length': 58.68, 'max_sentence1_length': 192, 'unique_sentence1': 1944, 'min_sentence2_length': 2, 'avg_sentence2_length': 58.88, 'max_sentence2_length': 198, 'unique_sentence2': 1941, 'unique_labels': 2, 'labels': {'1': {'count': 883}, '0': {'count': 1117}}}, 'ko': {'num_samples': 2000, 'number_of_characters': 260149, 'min_sentence1_length': 2, 'avg_sentence1_length': 64.96, 'max_sentence1_length': 153, 'unique_sentence1': 1954, 'min_sentence2_length': 2, 'avg_sentence2_length': 65.11, 'max_sentence2_length': 159, 'unique_sentence2': 1969, 'unique_labels': 2, 'labels': {'1': {'count': 896}, '0': {'count': 1104}}}, 'zh': {'num_samples': 2000, 'number_of_characters': 173012, 'min_sentence1_length': 2, 
'avg_sentence1_length': 43.23, 'max_sentence1_length': 120, 'unique_sentence1': 1909, 'min_sentence2_length': 2, 'avg_sentence2_length': 43.27, 'max_sentence2_length': 113, 'unique_sentence2': 1909, 'unique_labels': 2, 'labels': {'1': {'count': 894}, '0': {'count': 1106}}}}}, 'validation': {'num_samples': 14000, 'number_of_characters': 2524625, 'min_sentence1_length': 2, 'avg_sentence1_length': 90.13, 'max_sentence1_length': 248, 'unique_sentence1': 13357, 'min_sentence2_length': 2, 'avg_sentence2_length': 90.2, 'max_sentence2_length': 275, 'unique_sentence2': 13397, 'unique_labels': 2, 'labels': {'1': {'count': 5948}, '0': {'count': 8052}}, 'hf_subset_descriptive_stats': {'de': {'num_samples': 2000, 'number_of_characters': 467643, 'min_sentence1_length': 2, 'avg_sentence1_length': 116.82, 'max_sentence1_length': 248, 'unique_sentence1': 1914, 'min_sentence2_length': 2, 'avg_sentence2_length': 117.0, 'max_sentence2_length': 275, 'unique_sentence2': 1920, 'unique_labels': 2, 'labels': {'1': {'count': 831}, '0': {'count': 1169}}}, 'en': {'num_samples': 2000, 'number_of_characters': 451931, 'min_sentence1_length': 25, 'avg_sentence1_length': 113.11, 'max_sentence1_length': 213, 'unique_sentence1': 1758, 'min_sentence2_length': 25, 'avg_sentence2_length': 112.86, 'max_sentence2_length': 213, 'unique_sentence2': 1771, 'unique_labels': 2, 'labels': {'1': {'count': 863}, '0': {'count': 1137}}}, 'es': {'num_samples': 2000, 'number_of_characters': 466112, 'min_sentence1_length': 2, 'avg_sentence1_length': 116.33, 'max_sentence1_length': 240, 'unique_sentence1': 1938, 'min_sentence2_length': 2, 'avg_sentence2_length': 116.73, 'max_sentence2_length': 241, 'unique_sentence2': 1941, 'unique_labels': 2, 'labels': {'1': {'count': 847}, '0': {'count': 1153}}}, 'fr': {'num_samples': 2000, 'number_of_characters': 478510, 'min_sentence1_length': 2, 'avg_sentence1_length': 119.5, 'max_sentence1_length': 233, 'unique_sentence1': 1933, 'min_sentence2_length': 2, 'avg_sentence2_length': 
119.75, 'max_sentence2_length': 246, 'unique_sentence2': 1939, 'unique_labels': 2, 'labels': {'1': {'count': 860}, '0': {'count': 1140}}}, 'ja': {'num_samples': 2000, 'number_of_characters': 229655, 'min_sentence1_length': 2, 'avg_sentence1_length': 57.51, 'max_sentence1_length': 126, 'unique_sentence1': 1957, 'min_sentence2_length': 2, 'avg_sentence2_length': 57.32, 'max_sentence2_length': 121, 'unique_sentence2': 1969, 'unique_labels': 2, 'labels': {'1': {'count': 854}, '0': {'count': 1146}}}, 'ko': {'num_samples': 2000, 'number_of_characters': 261355, 'min_sentence1_length': 2, 'avg_sentence1_length': 65.16, 'max_sentence1_length': 178, 'unique_sentence1': 1963, 'min_sentence2_length': 2, 'avg_sentence2_length': 65.52, 'max_sentence2_length': 174, 'unique_sentence2': 1968, 'unique_labels': 2, 'labels': {'1': {'count': 840}, '0': {'count': 1160}}}, 'zh': {'num_samples': 2000, 'number_of_characters': 169419, 'min_sentence1_length': 2, 'avg_sentence1_length': 42.45, 'max_sentence1_length': 101, 'unique_sentence1': 1899, 'min_sentence2_length': 2, 'avg_sentence2_length': 42.26, 'max_sentence2_length': 120, 'unique_sentence2': 1895, 'unique_labels': 2, 'labels': {'1': {'count': 853}, '0': {'count': 1147}}}}}} | | [PersianFoodSentimentClassification](https://hooshvare.github.io/docs/datasets/sa) (Mehrdad Farahani et al., 2020) | ['fas'] | Classification | s2s | [Reviews, Written] | None | None | +| [PersianTextEmotion](https://huggingface.co/datasets/SeyedAli/Persian-Text-Emotion) | ['fas'] | Classification | s2s | | None | None | +| [PersianTextTone](https://mcinext.com/) | ['fas'] | Classification | s2p | | None | None | +| [PersianWebDocumentRetrieval](https://ieeexplore.ieee.org/document/10553090) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [PersonalJurisdictionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | 
[PhincBitextMining](https://huggingface.co/datasets/veezbo/phinc) (Srivastava et al., 2020) | ['eng', 'hin'] | BitextMining | s2s | [Social, Written] | {'train': 13738} | {'train': {'num_samples': 13738, 'number_of_characters': 2069457, 'unique_pairs': 13737, 'min_sentence1_length': 1, 'average_sentence1_length': 74.02, 'max_sentence1_length': 278, 'unique_sentence1': 13515, 'min_sentence2_length': 3, 'average_sentence2_length': 76.61, 'max_sentence2_length': 274, 'unique_sentence2': 13736, 'hf_subset_descriptive_stats': {'eng-eng_hin': {'num_samples': 13738, 'number_of_characters': 2069457, 'unique_pairs': 13737, 'min_sentence1_length': 1, 'average_sentence1_length': 74.02, 'max_sentence1_length': 278, 'unique_sentence1': 13515, 'min_sentence2_length': 3, 'average_sentence2_length': 76.61, 'max_sentence2_length': 274, 'unique_sentence2': 13736}}}} | | [PlscClusteringP2P.v2](https://huggingface.co/datasets/rafalposwiata/plsc) | ['pol'] | Clustering | s2s | [Academic, Written] | None | None | @@ -438,9 +474,11 @@ The following tables give you an overview of the tasks in MTEB. 
| [PunjabiNewsClassification](https://github.com/goru001/nlp-for-punjabi/) (Anoop Kunchukuttan, 2020) | ['pan'] | Classification | s2s | [News, Written] | None | None | | [QBQTC](https://github.com/CLUEbenchmark/QBQTC/tree/main/dataset) | ['cmn'] | STS | s2s | | None | None | | [Quail](https://text-machine.cs.uml.edu/lab2/projects/quail/) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | +| [Query2Query](https://mcinext.com/) | ['fas'] | STS | s2s | | None | None | | [Quora-PL](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2s | | None | None | | [Quora-PLHardNegatives](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2s | | None | None | | [QuoraRetrieval](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | | None | None | +| [QuoraRetrieval-Fa](https://huggingface.co/datasets/MCINext/quora-fa) | ['fas'] | Retrieval | s2s | [Web] | None | None | | [QuoraRetrievalHardNegatives](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | | None | None | | [RARbCode](https://arxiv.org/abs/2404.06347) (Xiao et al., 2024) | ['eng'] | Retrieval | s2p | [Programming, Written] | None | None | | [RARbMath](https://arxiv.org/abs/2404.06347) (Xiao et al., 2024) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | @@ -465,6 +503,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [RuSciBenchGRNTIClusteringP2P](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Clustering | p2p | [Academic, Written] | {'test': 2048} | {'test': {'num_samples': 2048, 'number_of_characters': 1822339, 'min_text_length': 84, 'average_text_length': 889.81, 'max_text_length': 3143, 'min_labels_per_text': 73, 'average_labels_per_text': 1.0, 'max_labels_per_text': 74, 'unique_labels': 28, 'labels': {'3': {'count': 73}, '4': {'count': 73}, '20': {'count': 73}, '9': {'count': 73}, '21': {'count': 73}, '15': {'count': 73}, '16': {'count': 74}, '2': {'count': 73}, '8': {'count': 73}, '23': {'count': 73}, '6': {'count': 73}, '24': {'count': 73}, '10': {'count': 73}, '1': {'count': 73}, '17': {'count': 74}, '14': {'count': 74}, '18': {'count': 73}, '27': {'count': 73}, '19': {'count': 73}, '22': {'count': 73}, '12': {'count': 73}, '25': {'count': 73}, '5': {'count': 74}, '0': {'count': 73}, '26': {'count': 73}, '11': {'count': 73}, '13': {'count': 73}, '7': {'count': 73}}}} | | [RuSciBenchOECDClassification](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Classification | p2p | [Academic, Written] | None | None | | [RuSciBenchOECDClusteringP2P](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Clustering | p2p | [Academic, Written] | None | None | +| [SAMSumFa](https://huggingface.co/datasets/MCINext/samsum-fa) | ['fas'] | SummaryRetrieval | s2p | [Spoken] | None | None | | [SCDBPAccountabilityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [SCDBPAuditsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [SCDBPCertificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | @@ -476,6 +515,7 @@ The 
following tables give you an overview of the tasks in MTEB. | [SCDDTrainingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [SCDDVerificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [SCIDOCS](https://allenai.org/data/scidocs) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Written, Non-fiction] | None | None | +| [SCIDOCS-Fa](https://huggingface.co/datasets/MCINext/scidocs-fa) | ['fas'] | Retrieval | s2p | [Academic] | None | None | | [SCIDOCS-PL](https://allenai.org/data/scidocs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | | [SDSEyeProtectionClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2p | [Chemistry] | None | None | | [SDSGlovesClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2p | [Chemistry] | None | None | @@ -487,6 +527,8 @@ The following tables give you an overview of the tasks in MTEB. 
| [SICK-R](https://aclanthology.org/L14-1314/) | ['eng'] | STS | s2s | [Web, Written] | None | None | | [SICK-R-PL](https://aclanthology.org/2020.lrec-1.207) | ['pol'] | STS | s2s | [Web, Written] | None | None | | [SICKFr](https://huggingface.co/datasets/Lajavaness/SICK-fr) | ['fra'] | STS | s2s | | None | None | +| [SIDClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Academic] | None | None | +| [SIDClustring](https://www.sid.com/) | ['fas'] | Clustering | p2p | [Academic] | None | None | | [SIQA](https://leaderboard.allenai.org/socialiqa/submissions/get-started) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [SKQuadRetrieval](https://huggingface.co/datasets/TUKE-KEMT/retrieval-skquad) | ['slk'] | Retrieval | s2s | [Encyclopaedic] | None | None | | [SNLHierarchicalClusteringP2P](https://huggingface.co/datasets/navjordj/SNL_summarization) (Navjord et al., 2023) | ['nob'] | Clustering | p2p | [Encyclopaedic, Non-fiction, Written] | None | None | @@ -509,10 +551,12 @@ The following tables give you an overview of the tasks in MTEB. 
| [ScalaClassification](https://aclanthology.org/2023.nodalida-1.20/) | ['dan', 'nno', 'nob', 'swe'] | Classification | s2s | [Fiction, News, Non-fiction, Blog, Spoken, Web, Written] | None | None | | [SciDocsRR](https://allenai.org/data/scidocs) | ['eng'] | Reranking | s2s | [Academic, Non-fiction, Written] | None | None | | [SciFact](https://github.com/allenai/scifact) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | +| [SciFact-Fa](https://huggingface.co/datasets/MCINext/scifact-fa) | ['fas'] | Retrieval | s2p | [Academic] | None | None | | [SciFact-PL](https://github.com/allenai/scifact) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | | [SemRel24STS](https://huggingface.co/datasets/SemRel/SemRel2024) (Nedjma Ousidhoum, 2024) | ['afr', 'amh', 'arb', 'arq', 'ary', 'eng', 'hau', 'hin', 'ind', 'kin', 'mar', 'tel'] | STS | s2s | [Spoken, Written] | None | None | | [SensitiveTopicsClassification](https://aclanthology.org/2021.bsnlp-1.4) | ['rus'] | MultilabelClassification | s2s | [Web, Social, Written] | None | None | | [SentimentAnalysisHindi](https://huggingface.co/datasets/OdiaGenAI/sentiment_analysis_hindi) (Shantipriya Parida, 2023) | ['hin'] | Classification | s2s | [Reviews, Written] | None | None | +| [SentimentDKSF](https://github.com/hezarai/hezar) | ['fas'] | Classification | s2p | [Reviews] | None | None | | [SinhalaNewsClassification](https://huggingface.co/datasets/NLPC-UOM/Sinhala-News-Category-classification) (Nisansa de Silva, 2015) | ['sin'] | Classification | s2s | [News, Written] | None | None | | [SinhalaNewsSourceClassification](https://huggingface.co/datasets/NLPC-UOM/Sinhala-News-Source-classification) (Dhananjaya et al., 2022) | ['sin'] | Classification | s2s | [News, Written] | None | None | | [SiswatiNewsClassification](https://huggingface.co/datasets/dsfsi/za-isizulu-siswati-news) (Madodonga et al., 2023) | ['ssw'] | Classification | s2s | 
[News, Written] | None | None | @@ -542,6 +586,33 @@ The following tables give you an overview of the tasks in MTEB. | [SwednClusteringS2S](https://spraakbanken.gu.se/en/resources/swedn) (Monsen et al., 2021) | ['swe'] | Clustering | s2s | [News, Non-fiction, Written] | None | None | | [SwednRetrieval](https://spraakbanken.gu.se/en/resources/swedn) (Monsen et al., 2021) | ['swe'] | Retrieval | p2p | [News, Non-fiction, Written] | None | None | | [SwissJudgementClassification](https://aclanthology.org/2021.nllp-1.3/) (Joel Niklaus, 2022) | ['deu', 'fra', 'ita'] | Classification | s2s | [Legal, Written] | None | None | +| [SynPerChatbotConvSAAnger](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| SynPerChatbotConvSAClassification | ['fas'] | Classification | None | [Spoken] | None | None | +| [SynPerChatbotConvSAFear](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotConvSAFriendship](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotConvSAHappiness](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotConvSAJealousy](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotConvSALove](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotConvSASadness](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotConvSASatisfaction](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotConvSASurprise](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotConvSAToneChatbotClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotConvSAToneUserClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | 
None | None | +| [SynPerChatbotRAGFAQPC](https://mcinext.com/) | ['fas'] | PairClassification | s2p | [Spoken] | None | None | +| [SynPerChatbotRAGFAQRetrieval](https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-rag-faq-retrieval) | ['fas'] | Retrieval | s2p | [Spoken] | None | None | +| [SynPerChatbotRAGSumSRetrieval](https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-rag-summary-retrieval) | ['fas'] | SummaryRetrieval | p2p | [Spoken] | None | None | +| [SynPerChatbotRAGToneChatbotClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotRAGToneUserClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotRAGTopicsRetrieval](https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-rag-topics-retrieval) | ['fas'] | Retrieval | s2p | [Spoken] | None | None | +| [SynPerChatbotSatisfactionLevelClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotSumSRetrieval](https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-summary-retrieval) | ['fas'] | SummaryRetrieval | p2p | [Spoken] | None | None | +| [SynPerChatbotToneChatbotClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotToneUserClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | +| [SynPerChatbotTopicsRetrieval](https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-topics-retrieval) | ['fas'] | Retrieval | s2p | [Spoken] | None | None | +| [SynPerQAPC](https://mcinext.com/) | ['fas'] | PairClassification | s2p | [Web, News, Religious, Blog] | None | None | +| [SynPerQARetrieval](https://huggingface.co/datasets/MCINext/synthetic-persian-qa-retrieval/settings) | ['fas'] | Retrieval | s2p | [Web] | None | None | +| [SynPerSTS](https://mcinext.com/) | ['fas'] | STS | s2s | [Web, 
News, Religious, Blog] | None | None | +| [SynPerTextKeywordsPC](https://mcinext.com/) | ['fas'] | PairClassification | s2p | [Web, News, Religious, Blog] | None | None | | [SyntecReranking](https://huggingface.co/datasets/lyon-nlp/mteb-fr-reranking-syntec-s2p) (Mathieu Ciancone, 2024) | ['fra'] | Reranking | s2p | [Legal, Written] | None | None | | [SyntecRetrieval](https://huggingface.co/datasets/lyon-nlp/mteb-fr-retrieval-syntec-s2p) (Mathieu Ciancone, 2024) | ['fra'] | Retrieval | s2p | [Legal, Written] | None | None | | [SyntheticText2SQL](https://huggingface.co/datasets/gretelai/synthetic_text_to_sql) (Meyer et al., 2024) | ['eng', 'sql'] | Retrieval | p2p | [Programming, Written] | {'test': 111702} | {'test': {'number_of_characters': 14041553, 'num_samples': 111702, 'num_queries': 5851, 'num_documents': 105851, 'min_document_length': 13, 'average_document_length': 4.58, 'max_document_length': 281, 'unique_documents': 105851, 'min_query_length': 17, 'average_query_length': 2316.95, 'max_query_length': 762, 'unique_queries': 5851, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 5851}} | @@ -550,6 +621,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [TERRa](https://arxiv.org/pdf/2010.15925) (Shavrina et al., 2020) | ['rus'] | PairClassification | s2s | [News, Web, Written] | None | None | | [TNews](https://www.cluebenchmarks.com/introduce.html) | ['cmn'] | Classification | s2s | | None | None | | [TRECCOVID](https://ir.nist.gov/covidSubmit/index.html) (Kirk Roberts, 2021) | ['eng'] | Retrieval | s2p | [Medical, Academic, Written] | None | None | +| [TRECCOVID-Fa](https://huggingface.co/datasets/MCINext/trec-covid-fa) | ['fas'] | Retrieval | s2p | [Medical] | None | None | | [TRECCOVID-PL](https://ir.nist.gov/covidSubmit/index.html) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Academic, Medical, Non-fiction, Written] | None | None | | [TV2Nordretrieval](https://huggingface.co/datasets/alexandrainst/nordjylland-news-summarization) | ['dan'] | Retrieval | p2p | [News, Non-fiction, Written] | None | None | | [TamilNewsClassification](https://github.com/vanangamudi/tamil-news-classification) (Anoop Kunchukuttan, 2020) | ['tam'] | Classification | s2s | [News, Written] | None | None | @@ -573,6 +645,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [ThuNewsClusteringS2S.v2](http://thuctc.thunlp.org/) (Sun et al., 2016) | ['cmn'] | Clustering | s2s | [News, Written] | None | None | | [TopiOCQA](https://mcgill-nlp.github.io/topiocqa) (Vaibhav Adlakha, 2022) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [TopiOCQAHardNegatives](https://mcgill-nlp.github.io/topiocqa) (Vaibhav Adlakha, 2022) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [Touche2020-Fa](https://huggingface.co/datasets/MCINext/touche2020-fa) | ['fas'] | Retrieval | s2p | [Spoken] | None | None | | [Touche2020Retrieval.v3](https://github.com/castorini/touche-error-analysis) | ['eng'] | Retrieval | s2p | [Academic] | {'test': 303781} | {'test': {'number_of_characters': 637047138, 'num_samples': 303781, 'num_queries': 49, 'num_documents': 303732, 'min_document_length': 16, 'average_document_length': 0.01, 'max_document_length': 83, 'unique_documents': 303732, 'min_query_length': 41, 'average_query_length': 13000918.57, 'max_query_length': 105983, 'unique_queries': 49, 'min_relevant_docs_per_query': 40, 'average_relevant_docs_per_query': 58.14, 'max_relevant_docs_per_query': 87, 'unique_relevant_docs': 2732}} | | [ToxicChatClassification](https://aclanthology.org/2023.findings-emnlp.311/) (Zi Lin, 2023) | ['eng'] | Classification | s2s | [Constructed, Written] | None | None | | [ToxicConversationsClassification](https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/overview) (cjadams, 2019) | ['eng'] | Classification | s2s | [Social, Written] | None | None | @@ -650,1060 +723,1060 @@ The following tables give you an overview of the tasks in MTEB.
-| ISO Code | Language | Family | BitextMining | Classification | Clustering | InstructionRetrieval | MultilabelClassification | PairClassification | Reranking | Retrieval | STS | Speed | Summarization | Sum | -|---|------|------|------|------|------|------|------|------|------|------|------|---| -| aai | Arifama-Miniafia | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aak | Ankave | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aau | Abau | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aaz | Amarasi | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| abs | Ambonese Malay | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| abt | Ambulas | Ndu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| abx | Inabaknon | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aby | Aneme Wake | Yareban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ace | Achinese | Austronesian | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| acf | Saint Lucian Creole French | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| acm | Mesopotamian Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| acq | Ta'izzi-Adeni Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| acr | Achi | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| acu | Achuar-Shiwiar | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| adz | Adzera | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aeb | Tunisian Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| aer | Eastern Arrernte | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aey | Amele | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| afr | Afrikaans | Indo-European | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 10 | -| agd | Agarabi | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 1 | -| agg | Angor | Senagi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agm | Angaataha | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agn | Agutaynen | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agr | Aguaruna | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agt | Central Cagayan Agta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agu | Aguacateco | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aia | Arosi | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aii | Assyrian Neo-Aramaic | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ajp | South Levantine Arabic | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| aka | Akan | Atlantic-Congo | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ake | Akawaio | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| alp | Alune | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| alq | Algonquin | Algic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| als | Tosk Albanian | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | -| aly | Alyawarr | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ame | Yanesha' | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amf | Hamer-Banna | South Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amh | Amharic | Afro-Asiatic | 3 | 6 | 3 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 14 | -| amk | Ambai | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amm | Ama (Papua New Guinea) | Left May | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amn | Amanab | Border | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amo | Amo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amp | Alamblak | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amr | Amarakaeri | Harakmbut | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amu | 
Guerrero Amuzgo | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amx | Anmatyerre | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ang | Old English (ca. 450-1100) | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| anh | Nend | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| anp | Angika | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| anv | Denya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aoi | Anindilyakwa | Gunwinyguan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aoj | Mufian | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aom | Ömie | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aon | Bumbita Arapesh | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apb | Sa'a | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apc | Levantine Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| ape | Bukiyip | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apn | Apinayé | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apr | Arop-Lokep | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apu | Apurinã | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apw | Western Apache | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apz | Safeyoka | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ara | Arabic | Unclassified | 2 | 12 | 0 | 0 | 0 | 2 | 2 | 9 | 2 | 0 | 0 | 29 | -| arb | Standard Arabic | Afro-Asiatic | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 8 | -| are | Western Arrarnta | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| arl | Arabela | Zaparoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| arn | Mapudungun | Araucanian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| arp | Arapaho | Algic 
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| arq | Algerian Arabic | Afro-Asiatic | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 4 | -| ars | Najdi Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| ary | Moroccan Arabic | Afro-Asiatic | 1 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 7 | -| arz | Egyptian Arabic | Afro-Asiatic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | -| asm | Assamese | Indo-European | 5 | 3 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 14 | -| aso | Dano | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ast | Asturian | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ata | Pele-Ata | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| atb | Zaiwa | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| atd | Ata Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| atg | Ivbie North-Okpela-Arhe | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| att | Pamplona Atta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| auc | Waorani | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aui | Anuki | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| auy | Awiyaana | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| avt | Au | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| awa | Awadhi | Indo-European | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| awb | Awa (Papua New Guinea) | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| awk | Awabakal | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| awx | Awara | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ayr | Central Aymara | Aymaran | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| azb | South Azerbaijani | Turkic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| aze | Azerbaijani 
| Unclassified | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| azg | San Pedro Amuzgos Amuzgo | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| azj | North Azerbaijani | Turkic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| azz | Highland Puebla Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bak | Bashkir | Turkic | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| bam | Bambara | Mande | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | -| ban | Balinese | Austronesian | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| bao | Waimaha | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bba | Baatonum | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bbb | Barai | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bbc | Batak Toba | Austronesian | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| bbr | Girawa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bch | Bariai | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bco | Kaluli | Bosavi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bdd | Bunama | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bea | Beaver | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bef | Benabena | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bel | Belarusian | Indo-European | 4 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| bem | Bemba (Zambia) | Atlantic-Congo | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ben | Bengali | Indo-European | 7 | 9 | 2 | 0 | 0 | 1 | 2 | 6 | 1 | 0 | 0 | 28 | -| beo | Beami | Bosavi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ber | Berber (Other) | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| beu | Blagar | Timor-Alor-Pantar | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bew | Betawi | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 3 | -| bgc | Haryanvi | Indo-European | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| bgs | Tagabawa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bgt | Bughotu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bhb | Bhili | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bhd | Bhadrawahi | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bhg | Binandere | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bhl | Bimin | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bho | Bhojpuri | Indo-European | 2 | 2 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 | -| bhp | Bima | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| big | Biangai | Kunimaipan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjj | Kanauji | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjk | Barok | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjn | Banjar | Austronesian | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| bjp | Fanamaket | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjr | Binumarien | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjv | Bedjond | Central Sudanic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjz | Baruga | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bkd | Binukid | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bki | Baki | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bkq | Bakairí | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bkx | Baikeno | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| blw | Balangao | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| blz | Balantak | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bmh | Kein | Nuclear Trans New Guinea | 1 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bmk | Ghayavi | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bmr | Muinane | Boran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bmu | Somba-Siawari | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bnp | Bola | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bns | Bundeli | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| boa | Bora | Boran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bod | Tibetan | Sino-Tibetan | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | -| boj | Anjam | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bon | Bine | Eastern Trans-Fly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bos | Bosnian | Indo-European | 3 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| box | Buamu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| boy | Bodo (Central African Republic) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bpr | Koronadal Blaan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bps | Sarangani Blaan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bqc | Boko (Benin) | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bqp | Busa | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bra | Braj | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bre | Breton | Indo-European | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| brx | Bodo (India) | Sino-Tibetan | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| bsj | Bangwinji | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bsn | Barasana-Eduria | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bsp | Baga Sitemu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bss | Akoose | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bug | 
Buginese | Austronesian | 2 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| buk | Bugawac | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bul | Bulgarian | Indo-European | 3 | 4 | 1 | 0 | 1 | 1 | 1 | 2 | 0 | 0 | 0 | 13 | -| bus | Bokobaru | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bvd | Baeggu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bvr | Burarra | Maningrida | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bxh | Buhutu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| byr | Baruya | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| byx | Qaqet | Baining | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bzd | Bribri | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bzh | Mapos Buang | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bzj | Belize Kriol English | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| caa | Chortí | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cab | Garifuna | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cac | Chuj | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| caf | Southern Carrier | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cak | Kaqchikel | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cao | Chácobo | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cap | Chipaya | Uru-Chipaya | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| car | Galibi Carib | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cat | Catalan | Indo-European | 3 | 2 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | -| cav | Cavineña | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cax | Chiquitano | Chiquitano | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbc | Carapana | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbi | Chachi | Barbacoan | 1 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 1 | -| cbk | Chavacano | Indo-European | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| cbr | Cashibo-Cacataibo | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbs | Cashinahua | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbt | Chayahuita | Cahuapanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbu | Candoshi-Shapra | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbv | Cacua | Kakua-Nukak | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cco | Comaltepec Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ceb | Cebuano | Austronesian | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | -| cek | Eastern Khumi Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ces | Czech | Indo-European | 4 | 5 | 2 | 0 | 1 | 2 | 1 | 2 | 0 | 0 | 0 | 17 | -| cgc | Kagayanen | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cha | Chamorro | Austronesian | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| chd | Highland Oaxaca Chontal | Tequistlatecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| chf | Tabasco Chontal | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| chk | Chuukese | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| chq | Quiotepec Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| chv | Chuvash | Turkic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| chz | Ozumacín Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cjk | Chokwe | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| cjo | Ashéninka Pajonal | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cjv | Chuave | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ckb | Central Kurdish | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | -| cle | Lealao Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 1 | -| clu | Caluyanun | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cme | Cerma | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cmn | Mandarin Chinese | Sino-Tibetan | 4 | 10 | 4 | 0 | 0 | 3 | 4 | 10 | 9 | 0 | 0 | 44 | -| cmo | Central Mnong | Austroasiatic | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| cni | Asháninka | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cnl | Lalana Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cnt | Tepetotutla Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| code | unknown | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 37 | 0 | 0 | 0 | 37 | -| cof | Colorado | Barbacoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| con | Cofán | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cop | Coptic | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cor | Cornish | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cot | Caquinte | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cpa | Palantla Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cpb | Ucayali-Yurúa Ashéninka | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cpc | Ajyíninka Apurucayali | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cpu | Pichis Ashéninka | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cpy | South Ucayali Ashéninka | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| crh | Crimean Tatar | Turkic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| crn | El Nayar Cora | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| crx | Carrier | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| csb | Kashubian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cso | Sochiapam Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| csy | Siyin Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cta | Tataltepec Chatino | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cth | Thaiphum Chin | Bookkeeping | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ctp | Western Highland Chatino | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ctu | Chol | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cub | Cubeo | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cuc | Usila Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cui | Cuiba | Guahiboan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cuk | San Blas Kuna | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cut | Teutila Cuicatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cux | Tepeuxila Cuicatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cwe | Kwere | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cya | Nopala Chatino | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cym | Welsh | Indo-European | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| daa | Dangaléat | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dad | Marik | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dah | Gwahatike | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dan | Danish | Indo-European | 5 | 9 | 2 | 0 | 1 | 0 | 1 | 5 | 0 | 0 | 0 | 23 | -| ded | Dedua | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| deu | German | Indo-European | 6 | 14 | 7 | 0 | 1 | 7 | 2 | 18 | 4 | 0 | 0 | 59 | -| dgc | Casiguran Dumagat Agta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dgr | Dogrib | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dgz | Daga | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
1 | -| dhg | Dhangu-Djangu | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dif | Dieri | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dik | Southwestern Dinka | Nilotic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| div | Dhivehi | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dji | Djinang | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| djk | Eastern Maroon Creole | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| djr | Djambarrpuyngu | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dob | Dobu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| doi | Dogri (macrolanguage) | Unclassified | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| dop | Lukpa | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dov | Dombe | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dsb | Lower Sorbian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dtp | Kadazan Dusun | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dwr | Dawro | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dww | Dawawa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dwy | Dhuwaya | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dyu | Dyula | Mande | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| dza | Tunzu | Atlantic-Congo | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dzo | Dzongkha | Sino-Tibetan | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ebk | Eastern Bontok | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eko | Koti | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ell | Modern Greek (1453-) | Indo-European | 3 | 6 | 1 | 0 | 1 | 2 | 0 | 3 | 0 | 0 | 0 | 16 | -| emi | Mussau-Emira | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| emp | Northern Emberá | Chocoan | 
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eng | English | Indo-European | 17 | 160 | 18 | 3 | 1 | 13 | 8 | 108 | 13 | 2 | 1 | 344 | -| enq | Enga | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| epo | Esperanto | Artificial Language | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| eri | Ogea | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ese | Ese Ejja | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| esk | Northwest Alaska Inupiatun | Eskimo-Aleut | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| est | Estonian | Uralic | 2 | 2 | 1 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 8 | -| etr | Edolo | Bosavi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eus | Basque | Unclassified | 3 | 2 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | -| ewe | Ewe | Atlantic-Congo | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| faa | Fasu | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fai | Faiwol | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fao | Faroese | Indo-European | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 7 | -| far | Fataleka | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fas | Persian | Indo-European | 1 | 4 | 0 | 0 | 0 | 1 | 2 | 9 | 0 | 0 | 0 | 17 | -| ffm | Maasina Fulfulde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fij | Fijian | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| fil | Filipino | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| fin | Finnish | Uralic | 3 | 5 | 1 | 0 | 1 | 1 | 2 | 5 | 1 | 0 | 0 | 19 | -| fon | Fon | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| for | Fore | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fra | French | Indo-European | 7 | 13 | 8 | 0 | 1 | 6 | 3 | 15 | 4 | 0 | 1 | 58 | -| fry | Western Frisian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 1 | -| fuc | Pulaar | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fue | Borgu Fulfulde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fuf | Pular | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fuh | Western Niger Fulfulde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fur | Friulian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| fuv | Nigerian Fulfulde | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| gah | Alekano | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gai | Borei | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gam | Kandawo | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gaw | Nobonob | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gaz | West Central Oromo | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| gbm | Garhwali | Indo-European | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| gdn | Umanakaina | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gdr | Wipi | Eastern Trans-Fly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| geb | Kire | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gfk | Patpatar | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ghs | Guhu-Samane | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gla | Scottish Gaelic | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| gle | Irish | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| glg | Galician | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| glk | Gilaki | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| glv | Manx | Indo-European | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gmv | Gamo | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gng | 
Ngangam | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gnn | Gumatj | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gnw | Western Bolivian Guaraní | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gof | Gofa | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gom | Goan Konkani | Indo-European | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| grc | Ancient Greek (to 1453) | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| grn | Guarani | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| gsw | Swiss German | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gub | Guajajára | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| guh | Guahibo | Guahiboan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gui | Eastern Bolivian Guaraní | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| guj | Gujarati | Indo-European | 6 | 6 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 18 | -| gul | Sea Island Creole English | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gum | Guambiano | Barbacoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gun | Mbyá Guaraní | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| guo | Guayabero | Guahiboan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gup | Gunwinggu | Gunwinyguan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gux | Gourmanchéma | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gvc | Guanano | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gvf | Golin | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gvn | Kuku-Yalanji | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gvs | Gumawana | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gwi | Gwichʼin | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gym | Ngäbere | 
Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gyr | Guarayu | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hat | Haitian | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | -| hau | Hausa | Afro-Asiatic | 4 | 5 | 3 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 14 | -| haw | Hawaiian | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hbo | Ancient Hebrew | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hch | Huichol | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| heb | Hebrew | Afro-Asiatic | 4 | 5 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 11 | -| heg | Helong | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hin | Hindi | Indo-European | 9 | 12 | 2 | 0 | 0 | 2 | 2 | 10 | 2 | 0 | 0 | 39 | -| hix | Hixkaryána | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hla | Halia | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hlt | Matu Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hmn | Hmong | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hmo | Hiri Motu | Pidgin | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hne | Chhattisgarhi | Indo-European | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| hns | Caribbean Hindustani | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hop | Hopi | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hot | Hote | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hrv | Croatian | Indo-European | 4 | 3 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 10 | -| hsb | Upper Sorbian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hto | Minica Huitoto | Huitotoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hub | Huambisa | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hui | Huli | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hun | Hungarian | Uralic 
| 5 | 3 | 1 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 12 | -| hus | Huastec | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| huu | Murui Huitoto | Huitotoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| huv | San Mateo Del Mar Huave | Huavean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hvn | Sabu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hye | Armenian | Indo-European | 3 | 3 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 9 | -| ian | Iatmul | Ndu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ibo | Igbo | Atlantic-Congo | 3 | 5 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 12 | -| ido | Ido | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ign | Ignaciano | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ikk | Ika | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ikw | Ikwere | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ile | Interlingue | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ilo | Iloko | Austronesian | 2 | 1 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | -| imo | Imbongu | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ina | Interlingua (International Auxiliary Language Association) | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| inb | Inga | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ind | Indonesian | Austronesian | 6 | 7 | 1 | 0 | 0 | 1 | 1 | 4 | 1 | 0 | 0 | 21 | -| ino | Inoke-Yate | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| iou | Tuma-Irumu | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ipi | Ipili | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| isl | Icelandic | Indo-European | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 9 | -| isn | Isanzu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ita | Italian | 
Indo-European | 5 | 9 | 1 | 0 | 1 | 2 | 1 | 5 | 3 | 0 | 0 | 27 | -| iws | Sepik Iwam | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ixl | Ixil | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jac | Popti' | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jae | Yabem | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jao | Yanyuwa | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jav | Javanese | Austronesian | 4 | 7 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 13 | -| jic | Tol | Jicaquean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jid | Bu (Kaduna State) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jiv | Shuar | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jni | Janji | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jpn | Japanese | Japonic | 5 | 8 | 3 | 0 | 0 | 2 | 3 | 13 | 2 | 0 | 0 | 36 | -| jvn | Caribbean Javanese | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kab | Kabyle | Afro-Asiatic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| kac | Kachin | Sino-Tibetan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| kam | Kamba (Kenya) | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kan | Kannada | Dravidian | 6 | 7 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 19 | -| kaq | Capanahua | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kas | Kashmiri | Indo-European | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| kat | Georgian | Kartvelian | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 10 | -| kaz | Kazakh | Turkic | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | -| kbc | Kadiwéu | Guaicuruan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kbh | Camsá | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kbm | Iwal | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kbp | Kabiyè | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 3 | -| kbq | Kamano | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kdc | Kutu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kde | Makonde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kdl | Tsikimba | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kea | Kabuverdianu | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| kek | Kekchí | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ken | Kenyang | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kew | West Kewa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kfg | Kudiya | Dravidian | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kfy | Kumaoni | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kgf | Kube | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kgk | Kaiwá | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kgp | Kaingang | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| khk | Halh Mongolian | Mongolic-Khitan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| khm | Khmer | Austroasiatic | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | -| khs | Kasua | Bosavi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| khz | Keapara | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kik | Kikuyu | Atlantic-Congo | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| kin | Kinyarwanda | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 8 | -| kir | Kirghiz | Turkic | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | -| kiw | Northeast Kiwai | Kiwaian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kiz | Kisi | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kje | Kisar | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kjs | East Kewa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kkc | Odoodee | East Strickland | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kkl | Kosarek Yale | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| klt | Nukna | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| klv | Maskelynes | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmb | Kimbundu | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kmg | Kâte | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmh | Kalam | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmk | Limos Kalinga | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmo | Kwoma | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmr | Northern Kurdish | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| kms | Kamasau | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmu | Kanite | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| knc | Central Kanuri | Saharan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kne | Kankanaey | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| knf | Mankanya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| knj | Western Kanjobal | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| knv | Tabo | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kon | Kongo | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kor | Korean | Koreanic | 4 | 8 | 1 | 0 | 1 | 3 | 1 | 9 | 3 | 0 | 0 | 30 | -| kos | Kosraean | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpf | Komba | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpg | Kapingamarangi | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpj | Karajá | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 1 | -| kpr | Korafe-Yegha | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpw | Kobon | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpx | Mountain Koiali | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kqa | Mum | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kqc | Doromu-Koki | Manubaran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kqf | Kakabai | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kql | Kyenele | Yuat | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kqw | Kandas | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| krc | Karachay-Balkar | Turkic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ksd | Kuanua | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ksj | Uare | Kwalean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ksr | Borong | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ktm | Kurti | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kto | Kuot | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kud | 'Auhelawa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kue | Kuman (Papua New Guinea) | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kup | Kunimaipa | Kunimaipan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kur | Kurdish | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kvg | Kuni-Boazi | Anim | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kvn | Border Kuna | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kwd | Kwaio | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kwf | Kwara'ae | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kwi | Awa-Cuaiquer | Barbacoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kwj | Kwanga | Sepik | 1 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kyc | Kyaka | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kyf | Kouya | Kru | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kyg | Keyagana | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kyq | Kenga | Central Sudanic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kyz | Kayabí | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kze | Kosena | Bookkeeping | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kzj | Coastal Kadazan | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lac | Lacandon | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lao | Lao | Tai-Kadai | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | -| lat | Latin | Indo-European | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| lav | Latvian | Indo-European | 1 | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| lbb | Label | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lbk | Central Bontok | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lcm | Tungag | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| leu | Kara (Papua New Guinea) | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lex | Luang | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lfn | Lingua Franca Nova | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lgl | Wala | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lid | Nyindrou | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lif | Limbu | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lij | Ligurian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| lim | Limburgan | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| lin | Lingala | Atlantic-Congo | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | -| lit | Lithuanian | Indo-European | 4 | 
1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | -| llg | Lole | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lmo | Lombard | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| ltg | Latgalian | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| ltz | Luxembourgish | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| lua | Luba-Lulua | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| lug | Ganda | Atlantic-Congo | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | -| luo | Luo (Kenya and Tanzania) | Nilotic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | -| lus | Lushai | Sino-Tibetan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| lvs | Standard Latvian | Unclassified | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | -| lww | Lewo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| maa | San Jerónimo Tecóatl Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mad | Madurese | Austronesian | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| mag | Magahi | Indo-European | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| mai | Maithili | Indo-European | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| maj | Jalapa De Díaz Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mak | Makasar | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| mal | Malayalam | Dravidian | 7 | 7 | 2 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 19 | -| mam | Mam | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| maq | Chiquihuitlán Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mar | Marathi | Indo-European | 7 | 6 | 2 | 0 | 0 | 1 | 0 | 2 | 2 | 0 | 0 | 20 | -| mau | Huautla Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mav | Sateré-Mawé | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| max | North Moluccan Malay | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 1 | -| maz | Central Mazahua | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbb | Western Bukidnon Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbc | Macushi | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbh | Mangseng | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbj | Nadëb | Naduhup | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbl | Maxakalí | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbs | Sarangani Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbt | Matigsalug Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mca | Maca | Mataguayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcb | Machiguenga | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcd | Sharanahua | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcf | Matsés | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mco | Coatlán Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcp | Makaa | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcq | Ese | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcr | Menya | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mdy | Male (Ethiopia) | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| med | Melpa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mee | Mengen | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mek | Mekeo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| meq | Merey | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| met | Mato | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| meu | Motu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mey | Hassaniyya | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 1 | -| mgc | Morokodo | Central Sudanic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mgh | Makhuwa-Meetto | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mgw | Matumbi | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mhl | Mauwake | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mhr | Eastern Mari | Uralic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mib | Atatláhuca Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mic | Mi'kmaq | Algic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mie | Ocotepec Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mig | San Miguel El Grande Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mih | Chayuco Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mil | Peñoles Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| min | Minangkabau | Austronesian | 3 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | -| mio | Pinotepa Nacional Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mir | Isthmus Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mit | Southern Puebla Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| miz | Coatzospan Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mjc | San Juan Colorado Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mkd | Macedonian | Indo-European | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | -| mkj | Mokilese | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mkl | Mokole | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mkn | Kupang Malay | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mks | Silacayoapan Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mle | Manambu | Ndu | 1 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mlg | Malagasy | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mlh | Mape | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mlp | Bargam | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mlt | Maltese | Afro-Asiatic | 2 | 2 | 2 | 0 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 9 | -| mmo | Mangga Buang | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mmx | Madak | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mna | Mbula | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mni | Manipuri | Sino-Tibetan | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| mon | Mongolian | Unclassified | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| mop | Mopán Maya | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mos | Mossi | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| mox | Molima | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mph | Maung | Iwaidjan Proper | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mpj | Martu Wangka | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mpm | Yosondúa Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mpp | Migabac | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mps | Dadibi | Teberan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mpt | Mian | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mpx | Misima-Panaeati | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mqb | Mbuko | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mqj | Mamasa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mri | Maori | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | -| msa | Malay (macrolanguage) | Unclassified | 1 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 | -| 
msb | Masbatenyo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| msc | Sankaran Maninka | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| msk | Mansaka | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| msm | Agusan Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| msy | Aruamu | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mti | Maiwa (Papua New Guinea) | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mto | Totontepec Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mui | Musi | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| mup | Malvi | Indo-European | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| mux | Bo-Ung | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| muy | Muyang | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mva | Manam | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mvn | Minaveha | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mwc | Are | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mwe | Mwera (Chimwera) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mwf | Murrinh-Patha | Southern Daly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mwp | Kala Lagaw Ya | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mwr | Marwari | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mxb | Tezoatlán Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mxp | Tlahuitoltepec Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mxq | Juquila Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mxt | Jamiltepec Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mya | Burmese | Sino-Tibetan | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 9 | -| myk | Mamara Senoufo | Atlantic-Congo | 
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| myu | Mundurukú | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| myw | Muyuw | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| myy | Macuna | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mzz | Maiadomu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nab | Southern Nambikuára | Nambiquaran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| naf | Nabak | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nak | Nakanai | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nas | Naasioi | South Bougainville | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nbl | South Ndebele | Unclassified | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nbq | Nggem | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nca | Iyo | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nch | Central Huasteca Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ncj | Northern Puebla Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ncl | Michoacán Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ncu | Chumburung | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nde | North Ndebele | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ndg | Ndengereko | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ndj | Ndamba | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nds | Low German | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nep | Nepali (macrolanguage) | Unclassified | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| nfa | Dhao | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ngp | Ngulu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ngu | Guerrero Nahuatl | 
Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhe | Eastern Huasteca Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhg | Tetelcingo Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhi | Zacatlán-Ahuacatlán-Tepetzintla Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nho | Takuu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhr | Naro | Khoe-Kwadi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhu | Noone | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhw | Western Huasteca Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhy | Northern Oaxaca Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nif | Nek | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nii | Nii | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nij | Ngaju | Austronesian | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| nin | Ninzo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nko | Nkonya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nld | Dutch | Indo-European | 6 | 6 | 1 | 0 | 1 | 1 | 1 | 2 | 2 | 0 | 0 | 20 | -| nlg | Gela | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nna | Nyangumarta | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nno | Norwegian Nynorsk | Unclassified | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| nnq | Ngindo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| noa | Woun Meu | Chocoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nob | Norwegian Bokmål | Unclassified | 4 | 7 | 5 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 19 | -| noe | Nimadi | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nop | Numanggang | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| 
nor | Norwegian | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 3 | -| not | Nomatsiguenga | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nou | Ewage-Notu | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nov | Novial | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| npi | Nepali (individual language) | Indo-European | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | -| npl | Southeastern Puebla Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nqo | N'Ko | Artificial Language | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| nsn | Nehan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nso | Pedi | Atlantic-Congo | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | -| nss | Nali | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ntj | Ngaanyatjarra | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ntp | Northern Tepehuan | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ntu | Natügu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nus | Nuer | Nilotic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| nuy | Nunggubuyu | Gunwinyguan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nvm | Namiae | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nwi | Southwest Tanna | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nya | Nyanja | Atlantic-Congo | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | -| nys | Nyungar | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nyu | Nyungwe | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| obo | Obo Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| oci | Occitan (post 1500) | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| okv | Orokaiva | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| omw | 
South Tairora | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ong | Olo | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ons | Ono | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ood | Tohono O'odham | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| opm | Oksapmin | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ori | Oriya (macrolanguage) | Unclassified | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| orm | Oromo | Unclassified | 1 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| orv | Old Russian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ory | Odia | Indo-European | 5 | 4 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 15 | -| ote | Mezquital Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| otm | Eastern Highland Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| otn | Tenango Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| otq | Querétaro Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ots | Estado de México Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pab | Parecís | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pad | Paumarí | Arawan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pag | Pangasinan | Austronesian | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| pah | Tenharim | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pam | Pampanga | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pan | Panjabi | Indo-European | 6 | 6 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 18 | -| pao | Northern Paiute | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pap | Papiamento | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| pbt | Southern Pashto | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | 
-| pcm | Nigerian Pidgin | Indo-European | 1 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| pes | Iranian Persian | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | -| pib | Yine | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pio | Piapoco | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pir | Piratapuyo | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| piu | Pintupi-Luritja | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pjt | Pitjantjatjara | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pls | San Marcos Tlacoyalco Popoloca | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| plt | Plateau Malagasy | Austronesian | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| plu | Palikúr | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pma | Paama | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pms | Piemontese | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| poe | San Juan Atzingo Popoloca | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| poh | Poqomchi' | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| poi | Highland Popoluca | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pol | Polish | Indo-European | 4 | 11 | 4 | 0 | 1 | 4 | 0 | 18 | 4 | 0 | 0 | 46 | -| pon | Pohnpeian | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| por | Portuguese | Indo-European | 4 | 9 | 1 | 0 | 2 | 3 | 1 | 5 | 3 | 0 | 0 | 28 | -| poy | Pogolo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ppo | Folopa | Teberan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| prf | Paranan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pri | Paicî | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| prs | Dari | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ptp | Patep | Austronesian | 1 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ptu | Bambam | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pus | Pushto | Unclassified | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| pwg | Gapapaiwa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qub | Huallaga Huánuco Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| quc | K'iche' | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| quf | Lambayeque Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| quh | South Bolivian Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qul | North Bolivian Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qup | Southern Pastaza Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| quy | Ayacucho Quechua | Quechuan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| qvc | Cajamarca Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qve | Eastern Apurímac Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvh | Huamalíes-Dos de Mayo Huánuco Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvm | Margos-Yarowilca-Lauricocha Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvn | North Junín Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvs | San Martín Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvw | Huaylla Wanca Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvz | Northern Pastaza Quichua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qwh | Huaylas Ancash Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qxh | Panao Huánuco Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qxn | Northern Conchucos Ancash Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qxo | Southern Conchucos Ancash 
Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rai | Ramoaaina | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| raj | Rajasthani | Unclassified | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| reg | Kara (Tanzania) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rej | Rejang | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| rgu | Ringgou | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rkb | Rikbaktsa | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rmc | Carpathian Romani | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rmy | Vlax Romani | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rom | Romany | Unclassified | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| ron | Romanian | Indo-European | 5 | 6 | 1 | 0 | 1 | 0 | 1 | 3 | 1 | 0 | 0 | 18 | -| roo | Rotokas | North Bougainville | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rop | Kriol | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| row | Dela-Oenale | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rro | Waima | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ruf | Luguru | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rug | Roviana | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| run | Rundi | Atlantic-Congo | 1 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| rus | Russian | Indo-European | 5 | 13 | 6 | 0 | 2 | 4 | 2 | 16 | 4 | 0 | 0 | 52 | -| rwo | Rawa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sab | Buglere | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sag | Sango | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| sah | Yakut | Turkic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| san | Sanskrit | Indo-European | 5 | 3 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 
10 | -| sat | Santali | Austroasiatic | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| sbe | Saliba | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sbk | Safwa | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sbs | Subiya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| scn | Sicilian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| sco | Scots | Indo-European | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| seh | Sena | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sey | Secoya | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sgb | Mag-antsi Ayta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sgz | Sursurunga | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| shi | Tachelhit | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| shj | Shatt | Dajuic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| shn | Shan | Tai-Kadai | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| shp | Shipibo-Conibo | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sim | Mende (Papua New Guinea) | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sin | Sinhala | Indo-European | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | -| sja | Epena | Chocoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| slk | Slovak | Indo-European | 3 | 4 | 1 | 0 | 1 | 0 | 0 | 3 | 0 | 0 | 0 | 12 | -| sll | Salt-Yui | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| slv | Slovenian | Indo-European | 3 | 4 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 10 | -| smk | Bolinao | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| smo | Samoan | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| sna | Shona | Atlantic-Congo | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | -| snc | Sinaugoro | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| 
snd | Sindhi | Indo-European | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | -| snn | Siona | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| snp | Siane | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| snx | Sam | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sny | Saniyo-Hiyewe | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| som | Somali | Afro-Asiatic | 3 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 9 | -| soq | Kanasi | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sot | Southern Sotho | Atlantic-Congo | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | -| soy | Miyobe | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| spa | Spanish | Indo-European | 4 | 13 | 4 | 0 | 1 | 3 | 2 | 13 | 4 | 0 | 0 | 44 | -| spl | Selepet | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| spm | Akukem | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| spp | Supyire Senoufo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sps | Saposa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| spy | Sabaot | Nilotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sqi | Albanian | Unclassified | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| srd | Sardinian | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| sri | Siriano | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| srm | Saramaccan | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| srn | Sranan Tongo | Indo-European | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| srp | Serbian | Indo-European | 4 | 1 | 1 | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 9 | -| srq | Sirionó | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ssd | Siroi | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ssg | Seimat | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 
-| ssw | Swati | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | -| ssx | Samberigi | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| stp | Southeastern Tepehuan | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sua | Sulka | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sue | Suena | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sun | Sundanese | Austronesian | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 9 | -| sus | Susu | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| suz | Sunwar | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| svk | Slovakian Sign Language | Sign Language | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| swa | Swahili (macrolanguage) | Atlantic-Congo | 1 | 7 | 2 | 0 | 0 | 1 | 1 | 3 | 0 | 0 | 0 | 15 | -| swe | Swedish | Indo-European | 4 | 8 | 3 | 0 | 1 | 1 | 1 | 4 | 0 | 0 | 0 | 22 | -| swg | Swabian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| swh | Swahili (individual language) | Atlantic-Congo | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | -| swp | Suau | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sxb | Suba | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| szl | Silesian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| tac | Lowland Tarahumara | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tah | Tahitian | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| taj | Eastern Tamang | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tam | Tamil | Dravidian | 7 | 7 | 2 | 0 | 0 | 1 | 0 | 3 | 1 | 0 | 0 | 21 | -| taq | Tamasheq | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| tat | Tatar | Turkic | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| tav | Tatuyo | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| taw | Tai | Nuclear 
Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tbc | Takia | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tbf | Mandara | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tbg | North Tairora | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tbo | Tawala | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tbz | Ditammari | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tca | Ticuna | Ticuna-Yuri | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tcs | Torres Strait Creole | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tcz | Thado Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tdt | Tetun Dili | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tee | Huehuetla Tepehua | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tel | Telugu | Dravidian | 7 | 7 | 2 | 0 | 0 | 0 | 1 | 5 | 2 | 0 | 0 | 24 | -| ter | Tereno | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tet | Tetum | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tew | Tewa (USA) | Kiowa-Tanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tfr | Teribe | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tgk | Tajik | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | -| tgl | Tagalog | Austronesian | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | -| tgo | Sudest | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tgp | Tangoa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tha | Thai | Tai-Kadai | 4 | 8 | 1 | 0 | 0 | 1 | 1 | 6 | 0 | 0 | 0 | 21 | -| tif | Tifal | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tim | Timbe | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tir | Tigrinya | Afro-Asiatic | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | 
-| tiw | Tiwi | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tiy | Tiruray | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tke | Takwane | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tku | Upper Necaxa Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tlf | Telefol | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tmd | Haruai | Piawi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tna | Tacana | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tnc | Tanimuca-Retuarã | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tnk | Kwamera | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tnn | North Tanna | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tnp | Whitesands | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| toc | Coyutla Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tod | Toma | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tof | Gizrra | Eastern Trans-Fly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| toj | Tojolabal | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ton | Tonga (Tonga Islands) | Austronesian | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| too | Xicotepec De Juárez Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| top | Papantla Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tos | Highland Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tpa | Taupota | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tpi | Tok Pisin | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| tpt | Tlachichilco Tepehua | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tpz | Tinputz | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| trc | Copala Triqui | Otomanguean | 
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tsn | Tswana | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | -| tso | Tsonga | Atlantic-Congo | 1 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | -| tsw | Tsishingini | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ttc | Tektiteko | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tte | Bwanabwana | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tuc | Mutu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tue | Tuyuca | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tuf | Central Tunebo | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tuk | Turkmen | Turkic | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| tum | Tumbuka | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| tuo | Tucano | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tur | Turkish | Turkic | 4 | 7 | 1 | 0 | 0 | 3 | 0 | 3 | 2 | 0 | 0 | 20 | -| tvk | Southeast Ambrym | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| twi | Twi | Unclassified | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| txq | Tii | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| txu | Kayapó | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tyv | Tuvinian | Turkic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tzj | Tz'utujil | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tzl | Talossan | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tzm | Central Atlas Tamazight | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| tzo | Tzotzil | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ubr | Ubir | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ubu | Umbu-Ungu | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| udu | Uduk | Koman | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 1 | -| uig | Uighur | Turkic | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| ukr | Ukrainian | Indo-European | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | -| uli | Ulithian | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ulk | Meriam Mir | Eastern Trans-Fly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| umb | Umbundu | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| upv | Uripiv-Wala-Rano-Atchin | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ura | Urarina | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| urb | Urubú-Kaapor | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| urd | Urdu | Indo-European | 7 | 8 | 2 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 19 | -| uri | Urim | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| urt | Urat | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| urw | Sop | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| usa | Usarufa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| usp | Uspanteco | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| uvh | Uri | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| uvl | Lote | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| uzb | Uzbek | Unclassified | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| uzn | Northern Uzbek | Turkic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| vec | Venetian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| ven | Venda | Atlantic-Congo | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| vid | Vidunda | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| vie | Vietnamese | Austroasiatic | 5 | 6 | 1 | 0 | 0 | 1 | 0 | 5 | 0 | 0 | 0 | 18 | -| viv | Iduna | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| vmy | Ayautla Mazatec | Otomanguean | 1 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| waj | Waffa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wal | Wolaytta | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wap | Wapishana | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| war | Waray (Philippines) | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | -| wat | Kaninuwa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wbi | Vwanji | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wbp | Warlpiri | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wed | Wedau | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wer | Weri | Kunimaipan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wim | Wik-Mungkan | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wiu | Wiru | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wiv | Vitu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wln | Walloon | Indo-European | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wmt | Walmajarri | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wmw | Mwani | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wnc | Wantoat | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wnu | Usan | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wol | Wolof | Atlantic-Congo | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | -| wos | Hanga Hundi | Ndu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wrk | Garrwa | Garrwan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wro | Worrorra | Worrorran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wrs | Waris | Border | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wsk | Waskia | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wuu | Wu Chinese | Sino-Tibetan | 1 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wuv | Wuvulu-Aua | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xav | Xavánte | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xbi | Kombio | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xed | Hdi | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xho | Xhosa | Atlantic-Congo | 3 | 3 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 10 | -| xla | Kamula | Kamula-Elevala | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xnn | Northern Kankanay | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xon | Konkomba | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xsi | Sio | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xtd | Diuxi-Tilantongo Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xtm | Magdalena Peñasco Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yaa | Yaminahua | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yad | Yagua | Peba-Yagua | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yal | Yalunka | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yap | Yapese | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yaq | Yaqui | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yby | Yaweyuha | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ycn | Yucuna | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ydd | Eastern Yiddish | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| yid | Yiddish | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yka | Yakan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yle | Yele | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yml | Iamalele | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yon | Yongkom 
| Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yor | Yoruba | Atlantic-Congo | 4 | 5 | 3 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 | 16 | -| yrb | Yareba | Yareban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yre | Yaouré | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yss | Yessan-Mayo | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yue | Yue Chinese | Sino-Tibetan | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| yuj | Karkar-Yuri | Pauwasi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yut | Yopno | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yuw | Yau (Morobe Province) | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yva | Yawa | Yawa-Saweru | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zaa | Sierra de Juárez Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zab | Western Tlacolula Valley Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zac | Ocotlán Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zad | Cajonos Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zai | Isthmus Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zaj | Zaramo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zam | Miahuatlán Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zao | Ozolotepec Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zap | Zapotec | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zar | Rincón Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zas | Santo Domingo Albarradas Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zat | Tabaa Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zav | Yatzachi Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 1 | -| zaw | Mitla Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zca | Coatecas Altas Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zga | Kinga | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zho | Chinese | Unclassified | 2 | 2 | 1 | 0 | 0 | 2 | 1 | 13 | 0 | 0 | 0 | 21 | -| zia | Zia | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ziw | Zigula | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zlm | Malay (individual language) | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zos | Francisco León Zoque | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpc | Choapan Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpl | Lachixío Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpm | Mixtepec Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpo | Amatlán Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpq | Zoogocho Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpu | Yalálag Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpv | Chichicapan Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpz | Texmelucan Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zsm | Standard Malay | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | -| zsr | Southern Rincon Zapotec | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ztq | Quioquitani-Quierí Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zty | Yatee Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zul | Zulu | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | -| zyp | Zyphe Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 1 | -| Total | None | None | None | 1395 | 812 | 306 | 3 | 28 | 84 | 51 | 476 | 85 | 2 | 2 | +| ISO Code | Language | Family | BitextMining | Classification | Clustering | InstructionRetrieval | MultilabelClassification | PairClassification | Reranking | Retrieval | STS | Speed | Summarization | SummaryRetrieval | Sum | +|---|------|------|------|------|------|------|------|------|------|------|------|------|---| +| aai | Arifama-Miniafia | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aak | Ankave | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aau | Abau | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aaz | Amarasi | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| abs | Ambonese Malay | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| abt | Ambulas | Ndu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| abx | Inabaknon | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aby | Aneme Wake | Yareban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ace | Achinese | Austronesian | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| acf | Saint Lucian Creole French | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| acm | Mesopotamian Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | +| acq | Ta'izzi-Adeni Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| acr | Achi | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| acu | Achuar-Shiwiar | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| adz | Adzera | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aeb | Tunisian Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| aer | Eastern Arrernte | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aey | Amele | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 1 | +| afr | Afrikaans | Indo-European | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 10 | +| agd | Agarabi | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agg | Angor | Senagi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agm | Angaataha | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agn | Agutaynen | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agr | Aguaruna | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agt | Central Cagayan Agta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agu | Aguacateco | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aia | Arosi | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aii | Assyrian Neo-Aramaic | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ajp | South Levantine Arabic | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| aka | Akan | Atlantic-Congo | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ake | Akawaio | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| alp | Alune | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| alq | Algonquin | Algic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| als | Tosk Albanian | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 5 | +| aly | Alyawarr | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ame | Yanesha' | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amf | Hamer-Banna | South Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amh | Amharic | Afro-Asiatic | 3 | 6 | 3 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 14 | +| amk | Ambai | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amm | Ama (Papua New Guinea) | Left May | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amn | Amanab | Border | 1 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amo | Amo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amp | Alamblak | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amr | Amarakaeri | Harakmbut | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amu | Guerrero Amuzgo | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amx | Anmatyerre | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ang | Old English (ca. 450-1100) | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| anh | Nend | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| anp | Angika | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| anv | Denya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aoi | Anindilyakwa | Gunwinyguan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aoj | Mufian | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aom | Ömie | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aon | Bumbita Arapesh | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apb | Sa'a | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apc | Levantine Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | +| ape | Bukiyip | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apn | Apinayé | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apr | Arop-Lokep | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apu | Apurinã | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apw | Western Apache | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apz | Safeyoka | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ara | Arabic | Unclassified | 2 | 12 | 0 | 0 | 0 | 2 | 2 | 9 | 2 | 0 | 0 | 0 | 29 
| +| arb | Standard Arabic | Afro-Asiatic | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 8 | +| are | Western Arrarnta | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| arl | Arabela | Zaparoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| arn | Mapudungun | Araucanian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| arp | Arapaho | Algic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| arq | Algerian Arabic | Afro-Asiatic | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | +| ars | Najdi Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | +| ary | Moroccan Arabic | Afro-Asiatic | 1 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 7 | +| arz | Egyptian Arabic | Afro-Asiatic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 5 | +| asm | Assamese | Indo-European | 5 | 3 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 14 | +| aso | Dano | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ast | Asturian | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ata | Pele-Ata | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| atb | Zaiwa | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| atd | Ata Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| atg | Ivbie North-Okpela-Arhe | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| att | Pamplona Atta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| auc | Waorani | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aui | Anuki | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| auy | Awiyaana | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| avt | Au | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| awa | Awadhi | Indo-European | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| awb | Awa 
(Papua New Guinea) | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| awk | Awabakal | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| awx | Awara | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ayr | Central Aymara | Aymaran | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| azb | South Azerbaijani | Turkic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| aze | Azerbaijani | Unclassified | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| azg | San Pedro Amuzgos Amuzgo | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| azj | North Azerbaijani | Turkic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | +| azz | Highland Puebla Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bak | Bashkir | Turkic | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| bam | Bambara | Mande | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 5 | +| ban | Balinese | Austronesian | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| bao | Waimaha | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bba | Baatonum | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bbb | Barai | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bbc | Batak Toba | Austronesian | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| bbr | Girawa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bch | Bariai | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bco | Kaluli | Bosavi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bdd | Bunama | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bea | Beaver | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bef | Benabena | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bel | Belarusian | 
Indo-European | 4 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| bem | Bemba (Zambia) | Atlantic-Congo | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ben | Bengali | Indo-European | 7 | 9 | 2 | 0 | 0 | 1 | 2 | 6 | 1 | 0 | 0 | 0 | 28 | +| beo | Beami | Bosavi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ber | Berber (Other) | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| beu | Blagar | Timor-Alor-Pantar | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bew | Betawi | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| bgc | Haryanvi | Indo-European | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| bgs | Tagabawa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bgt | Bughotu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bhb | Bhili | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bhd | Bhadrawahi | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bhg | Binandere | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bhl | Bimin | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bho | Bhojpuri | Indo-European | 2 | 2 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| bhp | Bima | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| big | Biangai | Kunimaipan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjj | Kanauji | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjk | Barok | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjn | Banjar | Austronesian | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| bjp | Fanamaket | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjr | Binumarien | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjv | Bedjond | Central Sudanic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 1 | +| bjz | Baruga | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bkd | Binukid | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bki | Baki | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bkq | Bakairí | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bkx | Baikeno | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| blw | Balangao | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| blz | Balantak | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bmh | Kein | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bmk | Ghayavi | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bmr | Muinane | Boran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bmu | Somba-Siawari | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bnp | Bola | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bns | Bundeli | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| boa | Bora | Boran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bod | Tibetan | Sino-Tibetan | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 6 | +| boj | Anjam | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bon | Bine | Eastern Trans-Fly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bos | Bosnian | Indo-European | 3 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| box | Buamu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| boy | Bodo (Central African Republic) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bpr | Koronadal Blaan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bps | Sarangani Blaan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bqc | Boko (Benin) 
| Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bqp | Busa | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bra | Braj | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bre | Breton | Indo-European | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| brx | Bodo (India) | Sino-Tibetan | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| bsj | Bangwinji | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bsn | Barasana-Eduria | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bsp | Baga Sitemu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bss | Akoose | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bug | Buginese | Austronesian | 2 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| buk | Bugawac | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bul | Bulgarian | Indo-European | 3 | 4 | 1 | 0 | 1 | 1 | 1 | 2 | 0 | 0 | 0 | 0 | 13 | +| bus | Bokobaru | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bvd | Baeggu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bvr | Burarra | Maningrida | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bxh | Buhutu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| byr | Baruya | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| byx | Qaqet | Baining | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bzd | Bribri | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bzh | Mapos Buang | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bzj | Belize Kriol English | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| caa | Chortí | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cab | Garifuna | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cac | Chuj | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 1 | +| caf | Southern Carrier | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cak | Kaqchikel | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cao | Chácobo | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cap | Chipaya | Uru-Chipaya | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| car | Galibi Carib | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cat | Catalan | Indo-European | 3 | 2 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | +| cav | Cavineña | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cax | Chiquitano | Chiquitano | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbc | Carapana | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbi | Chachi | Barbacoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbk | Chavacano | Indo-European | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| cbr | Cashibo-Cacataibo | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbs | Cashinahua | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbt | Chayahuita | Cahuapanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbu | Candoshi-Shapra | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbv | Cacua | Kakua-Nukak | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cco | Comaltepec Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ceb | Cebuano | Austronesian | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 6 | +| cek | Eastern Khumi Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ces | Czech | Indo-European | 4 | 5 | 2 | 0 | 1 | 2 | 1 | 2 | 0 | 0 | 0 | 0 | 17 | +| cgc | Kagayanen | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cha | Chamorro | Austronesian | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| chd | Highland Oaxaca 
Chontal | Tequistlatecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| chf | Tabasco Chontal | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| chk | Chuukese | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| chq | Quiotepec Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| chv | Chuvash | Turkic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| chz | Ozumacín Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cjk | Chokwe | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| cjo | Ashéninka Pajonal | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cjv | Chuave | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ckb | Central Kurdish | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 6 | +| cle | Lealao Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| clu | Caluyanun | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cme | Cerma | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cmn | Mandarin Chinese | Sino-Tibetan | 4 | 10 | 4 | 0 | 0 | 3 | 4 | 10 | 9 | 0 | 0 | 0 | 44 | +| cmo | Central Mnong | Austroasiatic | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| cni | Asháninka | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cnl | Lalana Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cnt | Tepetotutla Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| code | unknown | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 37 | 0 | 0 | 0 | 0 | 37 | +| cof | Colorado | Barbacoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| con | Cofán | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cop | Coptic | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cor | Cornish | 
Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cot | Caquinte | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cpa | Palantla Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cpb | Ucayali-Yurúa Ashéninka | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cpc | Ajyíninka Apurucayali | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cpu | Pichis Ashéninka | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cpy | South Ucayali Ashéninka | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| crh | Crimean Tatar | Turkic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| crn | El Nayar Cora | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| crx | Carrier | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| csb | Kashubian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cso | Sochiapam Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| csy | Siyin Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cta | Tataltepec Chatino | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cth | Thaiphum Chin | Bookkeeping | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ctp | Western Highland Chatino | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ctu | Chol | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cub | Cubeo | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cuc | Usila Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cui | Cuiba | Guahiboan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cuk | San Blas Kuna | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cut | Teutila Cuicatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cux | Tepeuxila 
Cuicatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cwe | Kwere | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cya | Nopala Chatino | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cym | Welsh | Indo-European | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| daa | Dangaléat | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dad | Marik | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dah | Gwahatike | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dan | Danish | Indo-European | 5 | 9 | 2 | 0 | 1 | 0 | 1 | 5 | 0 | 0 | 0 | 0 | 23 | +| ded | Dedua | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| deu | German | Indo-European | 6 | 14 | 7 | 0 | 1 | 7 | 2 | 18 | 4 | 0 | 0 | 0 | 59 | +| dgc | Casiguran Dumagat Agta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dgr | Dogrib | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dgz | Daga | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dhg | Dhangu-Djangu | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dif | Dieri | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dik | Southwestern Dinka | Nilotic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| div | Dhivehi | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dji | Djinang | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| djk | Eastern Maroon Creole | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| djr | Djambarrpuyngu | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dob | Dobu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| doi | Dogri (macrolanguage) | Unclassified | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| dop | Lukpa | 
Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dov | Dombe | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dsb | Lower Sorbian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dtp | Kadazan Dusun | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dwr | Dawro | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dww | Dawawa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dwy | Dhuwaya | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dyu | Dyula | Mande | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| dza | Tunzu | Atlantic-Congo | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dzo | Dzongkha | Sino-Tibetan | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ebk | Eastern Bontok | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| eko | Koti | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ell | Modern Greek (1453-) | Indo-European | 3 | 6 | 1 | 0 | 1 | 2 | 0 | 3 | 0 | 0 | 0 | 0 | 16 | +| emi | Mussau-Emira | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| emp | Northern Emberá | Chocoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| eng | English | Indo-European | 17 | 160 | 18 | 3 | 1 | 13 | 8 | 108 | 13 | 2 | 1 | 0 | 344 | +| enq | Enga | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| epo | Esperanto | Artificial Language | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| eri | Ogea | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ese | Ese Ejja | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| esk | Northwest Alaska Inupiatun | Eskimo-Aleut | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| est | Estonian | Uralic | 2 | 2 | 1 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 8 | +| etr | Edolo | Bosavi | 1 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| eus | Basque | Unclassified | 3 | 2 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | +| ewe | Ewe | Atlantic-Congo | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| faa | Fasu | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fai | Faiwol | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fao | Faroese | Indo-European | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | +| far | Fataleka | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fas | Persian | Indo-European | 1 | 28 | 5 | 0 | 0 | 8 | 2 | 40 | 3 | 0 | 0 | 3 | 90 | +| ffm | Maasina Fulfulde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fij | Fijian | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| fil | Filipino | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| fin | Finnish | Uralic | 3 | 5 | 1 | 0 | 1 | 1 | 2 | 5 | 1 | 0 | 0 | 0 | 19 | +| fon | Fon | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| for | Fore | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fra | French | Indo-European | 7 | 13 | 8 | 0 | 1 | 6 | 3 | 15 | 4 | 0 | 1 | 0 | 58 | +| fry | Western Frisian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fuc | Pulaar | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fue | Borgu Fulfulde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fuf | Pular | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fuh | Western Niger Fulfulde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fur | Friulian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| fuv | Nigerian Fulfulde | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | +| gah | Alekano | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 1 | +| gai | Borei | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gam | Kandawo | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gaw | Nobonob | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gaz | West Central Oromo | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | +| gbm | Garhwali | Indo-European | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| gdn | Umanakaina | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gdr | Wipi | Eastern Trans-Fly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| geb | Kire | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gfk | Patpatar | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ghs | Guhu-Samane | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gla | Scottish Gaelic | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| gle | Irish | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| glg | Galician | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| glk | Gilaki | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| glv | Manx | Indo-European | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gmv | Gamo | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gng | Ngangam | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gnn | Gumatj | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gnw | Western Bolivian Guaraní | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gof | Gofa | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gom | Goan Konkani | Indo-European | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| grc | Ancient Greek (to 1453) | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| grn | Guarani | 
Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | +| gsw | Swiss German | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gub | Guajajára | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| guh | Guahibo | Guahiboan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gui | Eastern Bolivian Guaraní | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| guj | Gujarati | Indo-European | 6 | 6 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 18 | +| gul | Sea Island Creole English | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gum | Guambiano | Barbacoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gun | Mbyá Guaraní | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| guo | Guayabero | Guahiboan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gup | Gunwinggu | Gunwinyguan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gux | Gourmanchéma | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gvc | Guanano | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gvf | Golin | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gvn | Kuku-Yalanji | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gvs | Gumawana | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gwi | Gwichʼin | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gym | Ngäbere | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gyr | Guarayu | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hat | Haitian | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 5 | +| hau | Hausa | Afro-Asiatic | 4 | 5 | 3 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 14 | +| haw | Hawaiian | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hbo | Ancient Hebrew | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 1 | +| hch | Huichol | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| heb | Hebrew | Afro-Asiatic | 4 | 5 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 11 | +| heg | Helong | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hin | Hindi | Indo-European | 9 | 12 | 2 | 0 | 0 | 2 | 2 | 10 | 2 | 0 | 0 | 0 | 39 | +| hix | Hixkaryána | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hla | Halia | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hlt | Matu Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hmn | Hmong | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hmo | Hiri Motu | Pidgin | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hne | Chhattisgarhi | Indo-European | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| hns | Caribbean Hindustani | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hop | Hopi | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hot | Hote | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hrv | Croatian | Indo-European | 4 | 3 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 10 | +| hsb | Upper Sorbian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hto | Minica Huitoto | Huitotoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hub | Huambisa | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hui | Huli | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hun | Hungarian | Uralic | 5 | 3 | 1 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 12 | +| hus | Huastec | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| huu | Murui Huitoto | Huitotoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| huv | San Mateo Del Mar Huave | Huavean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hvn | Sabu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hye | Armenian | Indo-European | 3 | 3 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 9 | +| ian | Iatmul | Ndu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ibo | Igbo | Atlantic-Congo | 3 | 5 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 12 | +| ido | Ido | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ign | Ignaciano | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ikk | Ika | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ikw | Ikwere | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ile | Interlingue | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ilo | Iloko | Austronesian | 2 | 1 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 6 | +| imo | Imbongu | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ina | Interlingua (International Auxiliary Language Association) | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| inb | Inga | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ind | Indonesian | Austronesian | 6 | 7 | 1 | 0 | 0 | 1 | 1 | 4 | 1 | 0 | 0 | 0 | 21 | +| ino | Inoke-Yate | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| iou | Tuma-Irumu | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ipi | Ipili | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| isl | Icelandic | Indo-European | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 9 | +| isn | Isanzu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ita | Italian | Indo-European | 5 | 9 | 1 | 0 | 1 | 2 | 1 | 5 | 3 | 0 | 0 | 0 | 27 | +| iws | Sepik Iwam | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ixl | Ixil | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jac | Popti' | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 1 | +| jae | Yabem | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jao | Yanyuwa | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jav | Javanese | Austronesian | 4 | 7 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 13 | +| jic | Tol | Jicaquean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jid | Bu (Kaduna State) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jiv | Shuar | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jni | Janji | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jpn | Japanese | Japonic | 5 | 8 | 3 | 0 | 0 | 2 | 3 | 13 | 2 | 0 | 0 | 0 | 36 | +| jvn | Caribbean Javanese | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kab | Kabyle | Afro-Asiatic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| kac | Kachin | Sino-Tibetan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | +| kam | Kamba (Kenya) | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kan | Kannada | Dravidian | 6 | 7 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 19 | +| kaq | Capanahua | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kas | Kashmiri | Indo-European | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| kat | Georgian | Kartvelian | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 10 | +| kaz | Kazakh | Turkic | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | +| kbc | Kadiwéu | Guaicuruan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kbh | Camsá | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kbm | Iwal | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kbp | Kabiyè | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kbq | Kamano | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kdc | Kutu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 
| +| kde | Makonde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kdl | Tsikimba | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kea | Kabuverdianu | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | +| kek | Kekchí | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ken | Kenyang | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kew | West Kewa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kfg | Kudiya | Dravidian | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kfy | Kumaoni | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kgf | Kube | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kgk | Kaiwá | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kgp | Kaingang | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| khk | Halh Mongolian | Mongolic-Khitan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | +| khm | Khmer | Austroasiatic | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | +| khs | Kasua | Bosavi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| khz | Keapara | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kik | Kikuyu | Atlantic-Congo | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| kin | Kinyarwanda | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 8 | +| kir | Kirghiz | Turkic | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 7 | +| kiw | Northeast Kiwai | Kiwaian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kiz | Kisi | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kje | Kisar | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kjs | East Kewa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kkc | Odoodee | East Strickland | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 1 | +| kkl | Kosarek Yale | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| klt | Nukna | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| klv | Maskelynes | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmb | Kimbundu | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kmg | Kâte | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmh | Kalam | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmk | Limos Kalinga | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmo | Kwoma | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmr | Northern Kurdish | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| kms | Kamasau | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmu | Kanite | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| knc | Central Kanuri | Saharan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kne | Kankanaey | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| knf | Mankanya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| knj | Western Kanjobal | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| knv | Tabo | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kon | Kongo | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kor | Korean | Koreanic | 4 | 8 | 1 | 0 | 1 | 3 | 1 | 9 | 3 | 0 | 0 | 0 | 30 | +| kos | Kosraean | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpf | Komba | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpg | Kapingamarangi | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpj | Karajá | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 1 | +| kpr | Korafe-Yegha | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpw | Kobon | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpx | Mountain Koiali | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kqa | Mum | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kqc | Doromu-Koki | Manubaran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kqf | Kakabai | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kql | Kyenele | Yuat | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kqw | Kandas | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| krc | Karachay-Balkar | Turkic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ksd | Kuanua | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ksj | Uare | Kwalean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ksr | Borong | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ktm | Kurti | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kto | Kuot | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kud | 'Auhelawa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kue | Kuman (Papua New Guinea) | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kup | Kunimaipa | Kunimaipan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kur | Kurdish | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kvg | Kuni-Boazi | Anim | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kvn | Border Kuna | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kwd | Kwaio | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kwf | Kwara'ae | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kwi | Awa-Cuaiquer | Barbacoan | 1 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kwj | Kwanga | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kyc | Kyaka | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kyf | Kouya | Kru | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kyg | Keyagana | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kyq | Kenga | Central Sudanic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kyz | Kayabí | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kze | Kosena | Bookkeeping | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kzj | Coastal Kadazan | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lac | Lacandon | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lao | Lao | Tai-Kadai | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 5 | +| lat | Latin | Indo-European | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| lav | Latvian | Indo-European | 1 | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| lbb | Label | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lbk | Central Bontok | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lcm | Tungag | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| leu | Kara (Papua New Guinea) | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lex | Luang | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lfn | Lingua Franca Nova | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lgl | Wala | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lid | Nyindrou | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lif | Limbu | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lij | Ligurian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| lim | Limburgan | Indo-European 
| 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| lin | Lingala | Atlantic-Congo | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | +| lit | Lithuanian | Indo-European | 4 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | +| llg | Lole | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lmo | Lombard | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| ltg | Latgalian | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| ltz | Luxembourgish | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| lua | Luba-Lulua | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| lug | Ganda | Atlantic-Congo | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | +| luo | Luo (Kenya and Tanzania) | Nilotic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 5 | +| lus | Lushai | Sino-Tibetan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| lvs | Standard Latvian | Unclassified | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 5 | +| lww | Lewo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| maa | San Jerónimo Tecóatl Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mad | Madurese | Austronesian | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| mag | Magahi | Indo-European | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| mai | Maithili | Indo-European | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| maj | Jalapa De Díaz Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mak | Makasar | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| mal | Malayalam | Dravidian | 7 | 7 | 2 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 19 | +| mam | Mam | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| maq | Chiquihuitlán Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mar | Marathi | Indo-European | 7 | 6 | 2 | 0 | 0 | 1 | 0 | 2 
| 2 | 0 | 0 | 0 | 20 | +| mau | Huautla Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mav | Sateré-Mawé | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| max | North Moluccan Malay | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| maz | Central Mazahua | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbb | Western Bukidnon Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbc | Macushi | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbh | Mangseng | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbj | Nadëb | Naduhup | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbl | Maxakalí | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbs | Sarangani Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbt | Matigsalug Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mca | Maca | Mataguayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcb | Machiguenga | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcd | Sharanahua | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcf | Matsés | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mco | Coatlán Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcp | Makaa | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcq | Ese | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcr | Menya | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mdy | Male (Ethiopia) | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| med | Melpa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mee | Mengen | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mek | Mekeo | Austronesian 
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| meq | Merey | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| met | Mato | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| meu | Motu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mey | Hassaniyya | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mgc | Morokodo | Central Sudanic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mgh | Makhuwa-Meetto | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mgw | Matumbi | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mhl | Mauwake | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mhr | Eastern Mari | Uralic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mib | Atatláhuca Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mic | Mi'kmaq | Algic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mie | Ocotepec Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mig | San Miguel El Grande Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mih | Chayuco Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mil | Peñoles Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| min | Minangkabau | Austronesian | 3 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | +| mio | Pinotepa Nacional Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mir | Isthmus Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mit | Southern Puebla Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| miz | Coatzospan Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mjc | San Juan Colorado Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mkd | 
Macedonian | Indo-European | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 7 | +| mkj | Mokilese | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mkl | Mokole | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mkn | Kupang Malay | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mks | Silacayoapan Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mle | Manambu | Ndu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mlg | Malagasy | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mlh | Mape | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mlp | Bargam | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mlt | Maltese | Afro-Asiatic | 2 | 2 | 2 | 0 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 9 | +| mmo | Mangga Buang | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mmx | Madak | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mna | Mbula | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mni | Manipuri | Sino-Tibetan | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| mon | Mongolian | Unclassified | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| mop | Mopán Maya | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mos | Mossi | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| mox | Molima | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mph | Maung | Iwaidjan Proper | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mpj | Martu Wangka | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mpm | Yosondúa Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mpp | Migabac | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mps | Dadibi | Teberan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 1 | +| mpt | Mian | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mpx | Misima-Panaeati | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mqb | Mbuko | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mqj | Mamasa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mri | Maori | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 5 | +| msa | Malay (macrolanguage) | Unclassified | 1 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| msb | Masbatenyo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| msc | Sankaran Maninka | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| msk | Mansaka | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| msm | Agusan Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| msy | Aruamu | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mti | Maiwa (Papua New Guinea) | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mto | Totontepec Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mui | Musi | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| mup | Malvi | Indo-European | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| mux | Bo-Ung | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| muy | Muyang | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mva | Manam | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mvn | Minaveha | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mwc | Are | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mwe | Mwera (Chimwera) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mwf | Murrinh-Patha | Southern Daly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mwp | Kala Lagaw Ya 
| Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mwr | Marwari | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mxb | Tezoatlán Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mxp | Tlahuitoltepec Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mxq | Juquila Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mxt | Jamiltepec Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mya | Burmese | Sino-Tibetan | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 9 | +| myk | Mamara Senoufo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| myu | Mundurukú | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| myw | Muyuw | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| myy | Macuna | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mzz | Maiadomu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nab | Southern Nambikuára | Nambiquaran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| naf | Nabak | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nak | Nakanai | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nas | Naasioi | South Bougainville | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nbl | South Ndebele | Unclassified | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nbq | Nggem | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nca | Iyo | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nch | Central Huasteca Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ncj | Northern Puebla Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ncl | Michoacán Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| 
ncu | Chumburung | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nde | North Ndebele | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ndg | Ndengereko | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ndj | Ndamba | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nds | Low German | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nep | Nepali (macrolanguage) | Unclassified | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| nfa | Dhao | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ngp | Ngulu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ngu | Guerrero Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhe | Eastern Huasteca Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhg | Tetelcingo Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhi | Zacatlán-Ahuacatlán-Tepetzintla Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nho | Takuu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhr | Naro | Khoe-Kwadi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhu | Noone | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhw | Western Huasteca Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhy | Northern Oaxaca Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nif | Nek | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nii | Nii | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nij | Ngaju | Austronesian | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| nin | Ninzo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nko | Nkonya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nld | Dutch | Indo-European | 6 | 6 | 1 | 0 | 1 | 1 | 1 | 2 | 2 | 0 | 0 | 0 | 20 | +| nlg | Gela | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nna | Nyangumarta | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nno | Norwegian Nynorsk | Unclassified | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| nnq | Ngindo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| noa | Woun Meu | Chocoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nob | Norwegian Bokmål | Unclassified | 4 | 7 | 5 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 19 | +| noe | Nimadi | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nop | Numanggang | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nor | Norwegian | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 3 | +| not | Nomatsiguenga | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nou | Ewage-Notu | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nov | Novial | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| npi | Nepali (individual language) | Indo-European | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | +| npl | Southeastern Puebla Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nqo | N'Ko | Artificial Language | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| nsn | Nehan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nso | Pedi | Atlantic-Congo | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 6 | +| nss | Nali | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ntj | Ngaanyatjarra | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ntp | Northern Tepehuan | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ntu | Natügu | Austronesian | 1 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nus | Nuer | Nilotic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| nuy | Nunggubuyu | Gunwinyguan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nvm | Namiae | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nwi | Southwest Tanna | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nya | Nyanja | Atlantic-Congo | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 6 | +| nys | Nyungar | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nyu | Nyungwe | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| obo | Obo Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| oci | Occitan (post 1500) | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| okv | Orokaiva | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| omw | South Tairora | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ong | Olo | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ons | Ono | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ood | Tohono O'odham | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| opm | Oksapmin | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ori | Oriya (macrolanguage) | Unclassified | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| orm | Oromo | Unclassified | 1 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| orv | Old Russian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ory | Odia | Indo-European | 5 | 4 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 15 | +| ote | Mezquital Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| otm | Eastern Highland Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| otn | Tenango Otomi | Otomanguean | 1 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| otq | Querétaro Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ots | Estado de México Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pab | Parecís | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pad | Paumarí | Arawan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pag | Pangasinan | Austronesian | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| pah | Tenharim | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pam | Pampanga | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pan | Panjabi | Indo-European | 6 | 6 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 18 | +| pao | Northern Paiute | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pap | Papiamento | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| pbt | Southern Pashto | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | +| pcm | Nigerian Pidgin | Indo-European | 1 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| pes | Iranian Persian | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 6 | +| pib | Yine | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pio | Piapoco | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pir | Piratapuyo | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| piu | Pintupi-Luritja | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pjt | Pitjantjatjara | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pls | San Marcos Tlacoyalco Popoloca | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| plt | Plateau Malagasy | Austronesian | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | +| plu | Palikúr | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pma | Paama | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 1 | +| pms | Piemontese | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| poe | San Juan Atzingo Popoloca | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| poh | Poqomchi' | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| poi | Highland Popoluca | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pol | Polish | Indo-European | 4 | 11 | 4 | 0 | 1 | 4 | 0 | 18 | 4 | 0 | 0 | 0 | 46 | +| pon | Pohnpeian | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| por | Portuguese | Indo-European | 4 | 9 | 1 | 0 | 2 | 3 | 1 | 5 | 3 | 0 | 0 | 0 | 28 | +| poy | Pogolo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ppo | Folopa | Teberan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| prf | Paranan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pri | Paicî | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| prs | Dari | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ptp | Patep | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ptu | Bambam | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pus | Pushto | Unclassified | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| pwg | Gapapaiwa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qub | Huallaga Huánuco Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| quc | K'iche' | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| quf | Lambayeque Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| quh | South Bolivian Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qul | North Bolivian Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qup | Southern Pastaza Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| quy | Ayacucho 
Quechua | Quechuan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| qvc | Cajamarca Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qve | Eastern Apurímac Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvh | Huamalíes-Dos de Mayo Huánuco Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvm | Margos-Yarowilca-Lauricocha Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvn | North Junín Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvs | San Martín Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvw | Huaylla Wanca Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvz | Northern Pastaza Quichua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qwh | Huaylas Ancash Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qxh | Panao Huánuco Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qxn | Northern Conchucos Ancash Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qxo | Southern Conchucos Ancash Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rai | Ramoaaina | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| raj | Rajasthani | Unclassified | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| reg | Kara (Tanzania) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rej | Rejang | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| rgu | Ringgou | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rkb | Rikbaktsa | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rmc | Carpathian Romani | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rmy | Vlax Romani | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 
| +| rom | Romany | Unclassified | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| ron | Romanian | Indo-European | 5 | 6 | 1 | 0 | 1 | 0 | 1 | 3 | 1 | 0 | 0 | 0 | 18 | +| roo | Rotokas | North Bougainville | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rop | Kriol | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| row | Dela-Oenale | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rro | Waima | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ruf | Luguru | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rug | Roviana | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| run | Rundi | Atlantic-Congo | 1 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| rus | Russian | Indo-European | 5 | 13 | 6 | 0 | 2 | 4 | 2 | 16 | 4 | 0 | 0 | 0 | 52 | +| rwo | Rawa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sab | Buglere | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sag | Sango | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| sah | Yakut | Turkic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| san | Sanskrit | Indo-European | 5 | 3 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | +| sat | Santali | Austroasiatic | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| sbe | Saliba | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sbk | Safwa | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sbs | Subiya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| scn | Sicilian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| sco | Scots | Indo-European | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| seh | Sena | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sey | Secoya | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sgb | 
Mag-antsi Ayta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sgz | Sursurunga | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| shi | Tachelhit | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| shj | Shatt | Dajuic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| shn | Shan | Tai-Kadai | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | +| shp | Shipibo-Conibo | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sim | Mende (Papua New Guinea) | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sin | Sinhala | Indo-European | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 7 | +| sja | Epena | Chocoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| slk | Slovak | Indo-European | 3 | 4 | 1 | 0 | 1 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 12 | +| sll | Salt-Yui | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| slv | Slovenian | Indo-European | 3 | 4 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 10 | +| smk | Bolinao | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| smo | Samoan | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| sna | Shona | Atlantic-Congo | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | +| snc | Sinaugoro | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| snd | Sindhi | Indo-European | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | +| snn | Siona | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| snp | Siane | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| snx | Sam | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sny | Saniyo-Hiyewe | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| som | Somali | Afro-Asiatic | 3 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 9 | +| soq | Kanasi | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 
+| sot | Southern Sotho | Atlantic-Congo | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 5 | +| soy | Miyobe | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| spa | Spanish | Indo-European | 4 | 13 | 4 | 0 | 1 | 3 | 2 | 13 | 4 | 0 | 0 | 0 | 44 | +| spl | Selepet | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| spm | Akukem | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| spp | Supyire Senoufo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sps | Saposa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| spy | Sabaot | Nilotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sqi | Albanian | Unclassified | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| srd | Sardinian | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| sri | Siriano | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| srm | Saramaccan | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| srn | Sranan Tongo | Indo-European | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| srp | Serbian | Indo-European | 4 | 1 | 1 | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 9 | +| srq | Sirionó | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ssd | Siroi | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ssg | Seimat | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ssw | Swati | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 7 | +| ssx | Samberigi | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| stp | Southeastern Tepehuan | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sua | Sulka | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sue | Suena | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sun | Sundanese | Austronesian | 3 | 
4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 9 | +| sus | Susu | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| suz | Sunwar | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| svk | Slovakian Sign Language | Sign Language | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| swa | Swahili (macrolanguage) | Atlantic-Congo | 1 | 7 | 2 | 0 | 0 | 1 | 1 | 3 | 0 | 0 | 0 | 0 | 15 | +| swe | Swedish | Indo-European | 4 | 8 | 3 | 0 | 1 | 1 | 1 | 4 | 0 | 0 | 0 | 0 | 22 | +| swg | Swabian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| swh | Swahili (individual language) | Atlantic-Congo | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 6 | +| swp | Suau | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sxb | Suba | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| szl | Silesian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| tac | Lowland Tarahumara | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tah | Tahitian | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| taj | Eastern Tamang | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tam | Tamil | Dravidian | 7 | 7 | 2 | 0 | 0 | 1 | 0 | 3 | 1 | 0 | 0 | 0 | 21 | +| taq | Tamasheq | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| tat | Tatar | Turkic | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| tav | Tatuyo | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| taw | Tai | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tbc | Takia | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tbf | Mandara | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tbg | North Tairora | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tbo | Tawala | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 1 | +| tbz | Ditammari | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tca | Ticuna | Ticuna-Yuri | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tcs | Torres Strait Creole | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tcz | Thado Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tdt | Tetun Dili | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tee | Huehuetla Tepehua | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tel | Telugu | Dravidian | 7 | 7 | 2 | 0 | 0 | 0 | 1 | 5 | 2 | 0 | 0 | 0 | 24 | +| ter | Tereno | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tet | Tetum | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tew | Tewa (USA) | Kiowa-Tanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tfr | Teribe | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tgk | Tajik | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 6 | +| tgl | Tagalog | Austronesian | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | +| tgo | Sudest | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tgp | Tangoa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tha | Thai | Tai-Kadai | 4 | 8 | 1 | 0 | 0 | 1 | 1 | 6 | 0 | 0 | 0 | 0 | 21 | +| tif | Tifal | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tim | Timbe | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tir | Tigrinya | Afro-Asiatic | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | +| tiw | Tiwi | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tiy | Tiruray | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tke | Takwane | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tku | Upper Necaxa Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tlf | Telefol | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tmd | Haruai | Piawi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tna | Tacana | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tnc | Tanimuca-Retuarã | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tnk | Kwamera | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tnn | North Tanna | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tnp | Whitesands | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| toc | Coyutla Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tod | Toma | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tof | Gizrra | Eastern Trans-Fly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| toj | Tojolabal | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ton | Tonga (Tonga Islands) | Austronesian | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| too | Xicotepec De Juárez Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| top | Papantla Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tos | Highland Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tpa | Taupota | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tpi | Tok Pisin | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| tpt | Tlachichilco Tepehua | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tpz | Tinputz | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| trc | Copala Triqui | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tsn | Tswana | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 7 | +| tso | Tsonga | Atlantic-Congo | 1 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 7 | 
+| tsw | Tsishingini | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ttc | Tektiteko | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tte | Bwanabwana | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tuc | Mutu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tue | Tuyuca | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tuf | Central Tunebo | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tuk | Turkmen | Turkic | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| tum | Tumbuka | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| tuo | Tucano | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tur | Turkish | Turkic | 4 | 7 | 1 | 0 | 0 | 3 | 0 | 3 | 2 | 0 | 0 | 0 | 20 | +| tvk | Southeast Ambrym | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| twi | Twi | Unclassified | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| txq | Tii | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| txu | Kayapó | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tyv | Tuvinian | Turkic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tzj | Tz'utujil | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tzl | Talossan | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tzm | Central Atlas Tamazight | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| tzo | Tzotzil | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ubr | Ubir | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ubu | Umbu-Ungu | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| udu | Uduk | Koman | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| uig | Uighur | Turkic | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| ukr | Ukrainian | 
Indo-European | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | +| uli | Ulithian | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ulk | Meriam Mir | Eastern Trans-Fly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| umb | Umbundu | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| upv | Uripiv-Wala-Rano-Atchin | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ura | Urarina | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| urb | Urubú-Kaapor | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| urd | Urdu | Indo-European | 7 | 8 | 2 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 19 | +| uri | Urim | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| urt | Urat | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| urw | Sop | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| usa | Usarufa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| usp | Uspanteco | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| uvh | Uri | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| uvl | Lote | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| uzb | Uzbek | Unclassified | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| uzn | Northern Uzbek | Turkic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | +| vec | Venetian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| ven | Venda | Atlantic-Congo | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| vid | Vidunda | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| vie | Vietnamese | Austroasiatic | 5 | 6 | 1 | 0 | 0 | 1 | 0 | 5 | 0 | 0 | 0 | 0 | 18 | +| viv | Iduna | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| vmy | Ayautla Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 1 | +| waj | Waffa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wal | Wolaytta | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wap | Wapishana | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| war | Waray (Philippines) | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 5 | +| wat | Kaninuwa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wbi | Vwanji | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wbp | Warlpiri | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wed | Wedau | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wer | Weri | Kunimaipan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wim | Wik-Mungkan | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wiu | Wiru | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wiv | Vitu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wln | Walloon | Indo-European | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wmt | Walmajarri | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wmw | Mwani | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wnc | Wantoat | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wnu | Usan | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wol | Wolof | Atlantic-Congo | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 6 | +| wos | Hanga Hundi | Ndu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wrk | Garrwa | Garrwan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wro | Worrorra | Worrorran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wrs | Waris | Border | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wsk | Waskia | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wuu | Wu Chinese | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wuv | Wuvulu-Aua | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xav | Xavánte | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xbi | Kombio | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xed | Hdi | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xho | Xhosa | Atlantic-Congo | 3 | 3 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 10 | +| xla | Kamula | Kamula-Elevala | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xnn | Northern Kankanay | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xon | Konkomba | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xsi | Sio | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xtd | Diuxi-Tilantongo Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xtm | Magdalena Peñasco Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yaa | Yaminahua | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yad | Yagua | Peba-Yagua | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yal | Yalunka | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yap | Yapese | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yaq | Yaqui | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yby | Yaweyuha | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ycn | Yucuna | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ydd | Eastern Yiddish | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| yid | Yiddish | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yka | Yakan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yle | Yele | 
Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yml | Iamalele | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yon | Yongkom | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yor | Yoruba | Atlantic-Congo | 4 | 5 | 3 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 16 | +| yrb | Yareba | Yareban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yre | Yaouré | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yss | Yessan-Mayo | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yue | Yue Chinese | Sino-Tibetan | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| yuj | Karkar-Yuri | Pauwasi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yut | Yopno | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yuw | Yau (Morobe Province) | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yva | Yawa | Yawa-Saweru | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zaa | Sierra de Juárez Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zab | Western Tlacolula Valley Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zac | Ocotlán Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zad | Cajonos Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zai | Isthmus Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zaj | Zaramo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zam | Miahuatlán Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zao | Ozolotepec Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zap | Zapotec | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zar | Rincón Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| 
zas | Santo Domingo Albarradas Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zat | Tabaa Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zav | Yatzachi Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zaw | Mitla Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zca | Coatecas Altas Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zga | Kinga | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zho | Chinese | Unclassified | 2 | 2 | 1 | 0 | 0 | 2 | 1 | 13 | 0 | 0 | 0 | 0 | 21 | +| zia | Zia | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ziw | Zigula | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zlm | Malay (individual language) | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zos | Francisco León Zoque | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpc | Choapan Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpl | Lachixío Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpm | Mixtepec Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpo | Amatlán Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpq | Zoogocho Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpu | Yalálag Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpv | Chichicapan Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpz | Texmelucan Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zsm | Standard Malay | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 5 | +| zsr | Southern Rincon Zapotec | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ztq | 
Quioquitani-Quierí Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zty | Yatee Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zul | Zulu | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 7 | +| zyp | Zyphe Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| Total | None | None | None | 1395 | 836 | 311 | 3 | 28 | 91 | 51 | 507 | 88 | 2 | 2 | 3 |
From 7e5d6c81daa2512b1c257e645cd0de122184acc0 Mon Sep 17 00:00:00 2001 From: Roman Solomatin Date: Thu, 30 Jan 2025 13:49:10 +0300 Subject: [PATCH 051/275] Docs: update docs according to current state (#1870) * update docs * Apply suggestions from code review Co-authored-by: Isaac Chung * update readme * Update README.md Co-authored-by: Isaac Chung --------- Co-authored-by: Isaac Chung --- README.md | 22 +++--- docs/adding_a_benchmark.md | 7 ++ docs/adding_a_leaderboard_tab.md | 15 ----- docs/adding_a_model.md | 112 ++++++++++++++++++++++--------- 4 files changed, 97 insertions(+), 59 deletions(-) create mode 100644 docs/adding_a_benchmark.md delete mode 100644 docs/adding_a_leaderboard_tab.md diff --git a/README.md b/README.md index daf715f029..59cc5da9e2 100644 --- a/README.md +++ b/README.md @@ -472,24 +472,24 @@ evaluation.run(model, ...) ## Documentation -| Documentation | | -| ------------------------------ | ---------------------- | -| 📋 [Tasks] | Overview of available tasks | -| 📐 [Benchmarks] | Overview of available benchmarks | -| 📈 [Leaderboard] | The interactive leaderboard of the benchmark | -| 🤖 [Adding a model] | Information related to how to submit a model to the leaderboard | +| Documentation | | +|--------------------------------|-------------------------------------------------------------------------------------| +| 📋 [Tasks] | Overview of available tasks | +| 📐 [Benchmarks] | Overview of available benchmarks | +| 📈 [Leaderboard] | The interactive leaderboard of the benchmark | +| 🤖 [Adding a model] | Information related to how to submit a model to MTEB and to the leaderboard | | 👩‍🔬 [Reproducible workflows] | Information related to how to reproduce and create reproducible workflows with MTEB | -| 👩‍💻 [Adding a dataset] | How to add a new task/dataset to MTEB |  -| 👩‍💻 [Adding a leaderboard tab] | How to add a new leaderboard tab to MTEB |  -| 🤝 [Contributing] | How to contribute to MTEB and set it up for development | -| 🌐 [MMTEB] | An 
open-source effort to extend MTEB to cover a broad set of languages |   +| 👩‍💻 [Adding a dataset] | How to add a new task/dataset to MTEB | +| 👩‍💻 [Adding a benchmark] | How to add a new benchmark to MTEB and to the leaderboard | +| 🤝 [Contributing] | How to contribute to MTEB and set it up for development | +| 🌐 [MMTEB] | An open-source effort to extend MTEB to cover a broad set of languages | [Tasks]: docs/tasks.md [Benchmarks]: docs/benchmarks.md [Contributing]: CONTRIBUTING.md [Adding a model]: docs/adding_a_model.md [Adding a dataset]: docs/adding_a_dataset.md -[Adding a leaderboard tab]: docs/adding_a_leaderboard_tab.md +[Adding a benchmark]: docs/adding_a_benchmark.md [Leaderboard]: https://huggingface.co/spaces/mteb/leaderboard [MMTEB]: docs/mmteb/readme.md [Reproducible workflows]: docs/reproducible_workflow.md diff --git a/docs/adding_a_benchmark.md b/docs/adding_a_benchmark.md new file mode 100644 index 0000000000..56a042fdb9 --- /dev/null +++ b/docs/adding_a_benchmark.md @@ -0,0 +1,7 @@ +## Adding a benchmark + +The MTEB Leaderboard is available [here](https://huggingface.co/spaces/mteb/leaderboard) and we encourage additions of new benchmarks. To add a new benchmark: + +1. Add your benchmark to [benchmarks.py](../mteb/benchmarks/benchmarks.py) as a `Benchmark` object, and select the MTEB tasks that will be in the benchmark. If some of the tasks do not exist in MTEB, follow the "add a dataset" instructions to add them. +2. Open a PR at https://github.com/embeddings-benchmark/results with results of models on your benchmark. +3. When PRs are merged, your benchmark will be added to the leaderboard automatically after the next workflow trigger. 
\ No newline at end of file diff --git a/docs/adding_a_leaderboard_tab.md b/docs/adding_a_leaderboard_tab.md deleted file mode 100644 index 260293ed5c..0000000000 --- a/docs/adding_a_leaderboard_tab.md +++ /dev/null @@ -1,15 +0,0 @@ -## Adding a new Leaderboard tab - -The MTEB Leaderboard is available [here](https://huggingface.co/spaces/mteb/leaderboard) and we love new leaderboard tabs. To add a new leaderboard tab: - -1. Open a PR in https://hf.co/datasets/mteb/results with: -- All results added in existing model folders or new folders -- Updated paths.json (see snippet results.py) -- If adding any new models, their names added to results.py -- If you have access to all models you are adding, you can also [add results via the metadata](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md) for all of them / some of them -2. Open a PR at https://huggingface.co/spaces/mteb/leaderboard modifying app.py to add your tab: -- Add any new models & their specs to the global lists -- Add your tab, credits etc to where the other tabs are defined -- If you're adding new results to existing models, remove those models from `EXTERNAL_MODEL_RESULTS.json` such that they can be reloaded with the new results and are not cached. -- You may also have to uncomment `, download_mode='force_redownload', verification_mode="no_checks")` where the datasets are loaded to experiment locally without caching of results -- Test that it runs & works locally as you desire with python app.py, **please add screenshots to the PR** diff --git a/docs/adding_a_model.md b/docs/adding_a_model.md index 314c6e9c39..088199e264 100644 --- a/docs/adding_a_model.md +++ b/docs/adding_a_model.md @@ -2,7 +2,63 @@ The MTEB Leaderboard is available [here](https://huggingface.co/spaces/mteb/leaderboard). To submit to it: -1. **Run the desired model on MTEB:** +1. **Add meta information about your model to [model dir](../mteb/models/)**. 
+ ```python + from mteb.model_meta import ModelMeta + + bge_m3 = ModelMeta( + name="model_name", + languages=["model_languages"], # in format eng-Latn + open_weights=True, + revision="5617a9f61b028005a4858fdac845db406aefb181", + release_date="2024-06-28", + n_parameters=568_000_000, + embed_dim=4096, + license="mit", + max_tokens=8194, + reference="https://huggingface.co/BAAI/bge-m3", + similarity_fn_name="cosine", + framework=["Sentence Transformers", "PyTorch"], + use_instructions=False, + public_training_code=None, + public_training_data="https://huggingface.co/datasets/cfli/bge-full-data", + training_datasets={"your_dataset": ["train"]}, + ) + ``` + By default, the model will run using the [`sentence_transformers_loader`](../mteb/models/sentence_transformer_wrapper.py) loader function. If you need to use a custom implementation, you can specify the `loader` parameter in the `ModelMeta` class. For example: + ```python + from mteb.models.wrapper import Wrapper + from mteb.encoder_interface import PromptType + import numpy as np + + class CustomWrapper(Wrapper): + def __init__(self, model_name, model_revision): + super().__init__(model_name, model_revision) + # your custom implementation here + + def encode( + self, + sentences: list[str], + *, + task_name: str, + prompt_type: PromptType | None = None, + **kwargs + ) -> np.ndarray: + # your custom implementation here + return np.zeros((len(sentences), self.embed_dim)) + ``` + Then you can specify the `loader` parameter in the `ModelMeta` class: + ```python + your_model = ModelMeta( + loader=partial( + CustomWrapper, + model_name="model_name", + model_revision="5617a9f61b028005a4858fdac845db406aefb181" + ), + ... + ) + ``` +2. 
**Run the desired model on MTEB:** Either use the Python API: @@ -32,45 +88,35 @@ These will save the results in a folder called `results/{model_name}/{model_revi To add results to the public leaderboard you can push your results to the [results repository](https://github.com/embeddings-benchmark/results) via a PR. Once merged they will appear on the leaderboard after a day. - -3. (Optional) **Add results to the model card:** - -`mteb` implements a cli for adding results to the model card: - -```bash -mteb create_meta --results_folder results/{model_name}/{model_revision} --output_path model_card.md -``` - -To add the content to the public model simply copy the content of the `model_card.md` file to the top of a `README.md` file of your model on the Hub. See [here](https://huggingface.co/Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit/blob/main/README.md) for an example. - -If the readme already exists: - -```bash -mteb create_meta --results_folder results/{model_name}/{model_revision} --output_path model_card.md --from_existing your_existing_readme.md -``` - -Note that running the model on many tasks may lead to a huge readme front matter. - -4. **Wait for a refresh the leaderboard:** - -The leaderboard [automatically refreshes daily](https://github.com/embeddings-benchmark/leaderboard/commits/main/) so once submitted you only need to wait for the automatic refresh. You can find the workflows for the leaderboard refresh [here](https://github.com/embeddings-benchmark/leaderboard/tree/main/.github/workflows). If you experience issues with the leaderboard please create an [issue](https://github.com/embeddings-benchmark/mteb/issues). +3. **Wait for a refresh of the leaderboard** **Notes:** -- We remove models with scores that cannot be reproduced, so please ensure that your model is accessible and scores can be reproduced. 
-- ##### Using Prompts with Sentence Transformers +##### Using Prompts with Sentence Transformers - If your model uses Sentence Transformers and requires different prompts for encoding the queries and corpus, you can take advantage of the `prompts` [parameter](https://sbert.net/docs/package_reference/sentence_transformer/SentenceTransformer.html#sentence_transformers.SentenceTransformer). - - Internally, `mteb` uses the prompt named `query` for encoding the queries and `passage` as the prompt name for encoding the corpus. This is aligned with the default names used by Sentence Transformers. +If your model uses Sentence Transformers and requires different prompts for encoding the queries and corpus, you can take advantage of the `prompts` [parameter](https://sbert.net/docs/package_reference/sentence_transformer/SentenceTransformer.html#sentence_transformers.SentenceTransformer). - ###### Adding the prompts in the model configuration (Preferred) +Internally, `mteb` uses the prompt name `query` for encoding the queries and `passage` for encoding the corpus. This is aligned with the default names used by Sentence Transformers. - You can directly add the prompts when saving and uploading your model to the Hub. For an example, refer to this [configuration file](https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5/blob/3b5a16eaf17e47bd997da998988dce5877a57092/config_sentence_transformers.json). +###### Adding the prompts in the model configuration (Preferred) - ###### Instantiating the Model with Prompts +You can directly add the prompts when saving and uploading your model to the Hub. For an example, refer to this [configuration file](https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5/blob/3b5a16eaf17e47bd997da998988dce5877a57092/config_sentence_transformers.json). These prompts can then be specified in the ModelMeta object. 
- If you are unable to directly add the prompts in the model configuration, you can instantiate the model using the `sentence_transformers_loader` and pass `prompts` as an argument. For more details, see the `mteb/models/bge_models.py` file. + +```python +model = ModelMeta( + loader=partial( # type: ignore + sentence_transformers_loader, + model_name="intfloat/multilingual-e5-small", + revision="fd1525a9fd15316a2d503bf26ab031a61d056e98", + model_prompts={ + "query": "query: ", + "passage": "passage: ", + }, + ), +) +``` +If you are unable to directly add the prompts in the model configuration, you can instantiate the model using the `sentence_transformers_loader` and pass `prompts` as an argument. For more details, see the `mteb/models/bge_models.py` file. ##### Adding instruction models @@ -85,4 +131,4 @@ model = ModelMeta( ), ... ) -``` \ No newline at end of file +``` From 7996458822e8f8f8de05e4c3a43c609e026b25b1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 10:51:32 +0000 Subject: [PATCH 052/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index cee6247578..a9467a95f0 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 6cc0560305caf82d9a3bc876ee2d57bedb08cb0d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 10:55:22 +0000 Subject: [PATCH 053/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index a9467a95f0..66a80fd610 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Programming, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From f258cfc96063531aeb2a4e1e13a1e21b43ad04d2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 10:57:30 +0000 Subject: [PATCH 054/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 66a80fd610..c62d5b1a59 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Programming, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From d0bb5b9753f36de826f45b956990d422757f24be Mon Sep 17 00:00:00 2001 From: Wissam Siblini <36303760+wissam-sib@users.noreply.github.com> Date: Thu, 30 Jan 2025 15:28:08 +0100 Subject: [PATCH 055/275] Adding a banner to the new MMTEB leaderboard (#1908) * Adding a banner to the new MMTEB leaderboard * linting * Update mteb/leaderboard/app.py Co-authored-by: Isaac Chung * adding reference to mteb arena --------- Co-authored-by: Isaac Chung --- mteb/leaderboard/app.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 483b681e23..dc6eb74ab3 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -234,6 +234,12 @@ def filter_models( """ with 
gr.Blocks(fill_width=True, theme=gr.themes.Base(), head=head) as demo: + gr.Markdown(""" + ## MMTEB: Massive Multilingual Text Embedding Benchmark + + The MMTEB leaderboard compares text embedding models on 1000+ languages. Check out the [paper](https://openreview.net/pdf?id=zl3pfz4VCV) for details on datasets, languages and tasks. And you can contribute! 🤗 To add a model, please refer to the documentation in the [GitHub repository](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md). Also check out [MTEB Arena](https://huggingface.co/spaces/mteb/arena) ⚔️ + """) + with gr.Row(): with gr.Column(scale=5): gr.Markdown( From 77681bf0900bed8b29688ade2a310b4ea7142e61 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 14:30:09 +0000 Subject: [PATCH 056/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index c62d5b1a59..cee6247578 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 2850a9706752d0cf341f674614a46880da6bea65 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 14:32:15 +0000 Subject: [PATCH 057/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index cee6247578..c62d5b1a59 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 28ad1723b5750f5b22b28783e1e57e38508f5498 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 14:34:42 +0000 Subject: [PATCH 058/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index c62d5b1a59..a9467a95f0 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 2e817b03c0e863543091581096392e8d9ccd0025 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 14:37:06 +0000 Subject: [PATCH 059/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index a9467a95f0..21b4036457 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Written, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From b7e412d5e29912305c0ef1dd0b34b8aeef7af8c7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 14:39:57 +0000 Subject: [PATCH 060/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 21b4036457..a9467a95f0 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Written, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 938e90f58714c525157d968a278ae3b07fc7b20a Mon Sep 17 00:00:00 2001 From: Imene Kerboua <33312980+imenelydiaker@users.noreply.github.com> Date: Thu, 30 Jan 2025 22:05:23 +0100 Subject: [PATCH 061/275] fix: Filling missing metadata for leaderboard release (#1895) * Update ArxivClusteringS2S.py * fill some metadata for retrieval * fill in the rest of missing metadata * fix metadata * fix climatefever metadata * fix: Added CQADupstack annotations * removed annotation for non-existent task * format * Added financial to other financial dataset * Moved ArguAna annotation to derivative datasets --------- Co-authored-by: Kenneth Enevoldsen --- mteb/abstasks/TaskMetadata.py | 2 + mteb/models/arctic_models.py | 3 +- 
mteb/models/e5_instruct.py | 1 - mteb/models/e5_models.py | 1 - mteb/models/gritlm_models.py | 1 - mteb/models/salesforce_models.py | 1 - .../eng/FinancialPhrasebankClassification.py | 2 +- mteb/tasks/Classification/kor/KorFin.py | 2 +- .../Clustering/eng/ArxivClusteringS2S.py | 14 +++--- mteb/tasks/Clustering/eng/RedditClustering.py | 15 +++--- .../Clustering/eng/RedditClusteringP2P.py | 15 +++--- .../Clustering/eng/StackExchangeClustering.py | 15 +++--- .../eng/StackExchangeClusteringP2P.py | 14 +++--- .../eng/TwitterSemEval2015PC.py | 12 ++--- .../eng/TwitterURLCorpusPC.py | 12 ++--- .../Reranking/eng/AskUbuntuDupQuestions.py | 8 ++-- .../eng/StackOverflowDupQuestions.py | 14 +++--- .../eng/CQADupstackAndroidRetrieval.py | 12 ++--- .../eng/CQADupstackEnglishRetrieval.py | 12 ++--- .../eng/CQADupstackGamingRetrieval.py | 12 ++--- .../Retrieval/eng/CQADupstackGisRetrieval.py | 12 ++--- .../eng/CQADupstackMathematicaRetrieval.py | 12 ++--- .../eng/CQADupstackPhysicsRetrieval.py | 12 ++--- .../eng/CQADupstackProgrammersRetrieval.py | 2 +- .../eng/CQADupstackStatsRetrieval.py | 12 ++--- .../Retrieval/eng/CQADupstackTexRetrieval.py | 12 ++--- .../Retrieval/eng/CQADupstackUnixRetrieval.py | 12 ++--- .../eng/CQADupstackWebmastersRetrieval.py | 12 ++--- .../eng/CQADupstackWordpressRetrieval.py | 12 ++--- .../Retrieval/eng/ClimateFEVERRetrieval.py | 24 +++++----- mteb/tasks/Retrieval/eng/FEVERRetrieval.py | 12 ++--- mteb/tasks/Retrieval/eng/FiQA2018Retrieval.py | 12 ++--- mteb/tasks/Retrieval/eng/MSMARCORetrieval.py | 46 ++++++++++++++----- .../tasks/Retrieval/eng/MSMARCOv2Retrieval.py | 23 +++++++--- mteb/tasks/Retrieval/eng/NQRetrieval.py | 12 ++--- mteb/tasks/Retrieval/eng/QuoraRetrieval.py | 12 ++--- mteb/tasks/Retrieval/eng/SciFactRetrieval.py | 4 +- mteb/tasks/Retrieval/kor/AutoRAGRetrieval.py | 2 +- .../tasks/Retrieval/pol/ArguAnaPLRetrieval.py | 6 +-- mteb/tasks/Retrieval/pol/FiQAPLRetrieval.py | 12 ++--- mteb/tasks/STS/eng/BiossesSTS.py | 12 ++--- 
mteb/tasks/STS/eng/STSBenchmarkSTS.py | 12 ++--- 42 files changed, 242 insertions(+), 213 deletions(-) diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py index 248c81ac6f..df6a48c1f0 100644 --- a/mteb/abstasks/TaskMetadata.py +++ b/mteb/abstasks/TaskMetadata.py @@ -71,6 +71,7 @@ "Written", "Programming", "Chemistry", + "Financial", ] SAMPLE_CREATION_METHOD = Literal[ @@ -171,6 +172,7 @@ "gpl-3.0", "cdla-sharing-1.0", "mpl-2.0", + "msr-la-nc", "multiple", ] ) diff --git a/mteb/models/arctic_models.py b/mteb/models/arctic_models.py index f765b01bff..e92c1ca098 100644 --- a/mteb/models/arctic_models.py +++ b/mteb/models/arctic_models.py @@ -110,7 +110,8 @@ # in MTEB "NQ": ["test"], "NQHardNegatives": ["test"], - "HotPotQA": ["test"], + "NQ-PL": ["test"], + "HotPotQA": ["test"], # translated, not trained on "HotPotQAHardNegatives": ["test"], "HotPotQA-PL": ["test"], # translated from hotpotQA (not trained on) "FEVER": ["test"], diff --git a/mteb/models/e5_instruct.py b/mteb/models/e5_instruct.py index 3eed189d33..3c18f9c27a 100644 --- a/mteb/models/e5_instruct.py +++ b/mteb/models/e5_instruct.py @@ -19,7 +19,6 @@ **E5_TRAINING_DATA, "FEVER": ["train"], "FEVERHardNegatives": ["train"], - "FEVER-PL": ["train"], # translation not trained on "HotpotQA": ["train"], "HotpotQAHardNegatives": ["train"], "HotpotQA-PL": ["train"], # translation not trained on diff --git a/mteb/models/e5_models.py b/mteb/models/e5_models.py index 0ad15e7320..94d04ee483 100644 --- a/mteb/models/e5_models.py +++ b/mteb/models/e5_models.py @@ -130,7 +130,6 @@ **E5_TRAINING_DATA, "FEVER": ["train"], "FEVERHardNegatives": ["train"], - "FEVER-PL": ["train"], # translation not trained on "HotpotQA": ["train"], "HotpotQAHardNegatives": ["train"], "HotpotQA-PL": ["train"], # translation not trained on diff --git a/mteb/models/gritlm_models.py b/mteb/models/gritlm_models.py index d15c1f4a55..1e3a0b42bd 100644 --- a/mteb/models/gritlm_models.py +++ b/mteb/models/gritlm_models.py @@ 
-16,7 +16,6 @@ # also uses medi2 which contains fever and hotpotqa: "FEVER": ["train"], "FEVERHardNegatives": ["train"], - "FEVER-PL": ["train"], # translation not trained on "HotpotQA": ["train"], "HotpotQAHardNegatives": ["train"], "HotpotQA-PL": ["train"], # translation not trained on diff --git a/mteb/models/salesforce_models.py b/mteb/models/salesforce_models.py index 235057a6f8..8c72265cc9 100644 --- a/mteb/models/salesforce_models.py +++ b/mteb/models/salesforce_models.py @@ -22,7 +22,6 @@ def instruction_template( "FiQA2018-PL": ["train"], "FEVER": ["train"], "FEVERHardNegatives": ["train"], - "FEVER-PL": ["train"], # translation not trained on "HotpotQA": ["train"], "HotpotQAHardNegatives": ["train"], "HotpotQA-PL": ["train"], # translation not trained on diff --git a/mteb/tasks/Classification/eng/FinancialPhrasebankClassification.py b/mteb/tasks/Classification/eng/FinancialPhrasebankClassification.py index 6ddb37c42a..b9abb5445a 100644 --- a/mteb/tasks/Classification/eng/FinancialPhrasebankClassification.py +++ b/mteb/tasks/Classification/eng/FinancialPhrasebankClassification.py @@ -22,7 +22,7 @@ class FinancialPhrasebankClassification(AbsTaskClassification): eval_langs=["eng-Latn"], main_score="accuracy", date=("2013-11-01", "2013-11-01"), - domains=["News", "Written"], + domains=["News", "Written", "Financial"], task_subtypes=["Sentiment/Hate speech"], license="cc-by-nc-sa-3.0", annotations_creators="expert-annotated", diff --git a/mteb/tasks/Classification/kor/KorFin.py b/mteb/tasks/Classification/kor/KorFin.py index a22b7d5cfe..1fdfb47694 100644 --- a/mteb/tasks/Classification/kor/KorFin.py +++ b/mteb/tasks/Classification/kor/KorFin.py @@ -25,7 +25,7 @@ class KorFin(AbsTaskClassification): "2022-01-01", "2022-12-31", ), # Assumed date based on the citations in the paper - domains=["News", "Written"], + domains=["News", "Written", "Financial"], task_subtypes=["Sentiment/Hate speech"], license="cc-by-sa-4.0", annotations_creators="expert-annotated", 
diff --git a/mteb/tasks/Clustering/eng/ArxivClusteringS2S.py b/mteb/tasks/Clustering/eng/ArxivClusteringS2S.py index c74766061d..8b4beb0e26 100644 --- a/mteb/tasks/Clustering/eng/ArxivClusteringS2S.py +++ b/mteb/tasks/Clustering/eng/ArxivClusteringS2S.py @@ -21,13 +21,13 @@ class ArxivClusteringS2S(AbsTaskClustering): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="v_measure", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + date=("1991-01-01", "2021-01-01"), # 1991-01-01 is the first arxiv paper + domains=["Academic", "Written"], + task_subtypes=[], + license="cc0-1.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@misc{arxiv_org_submitters_2024, title={arXiv Dataset}, url={https://www.kaggle.com/dsv/7548853}, diff --git a/mteb/tasks/Clustering/eng/RedditClustering.py b/mteb/tasks/Clustering/eng/RedditClustering.py index c9efbe954a..84c6602c63 100644 --- a/mteb/tasks/Clustering/eng/RedditClustering.py +++ b/mteb/tasks/Clustering/eng/RedditClustering.py @@ -85,14 +85,13 @@ class RedditClustering(AbsTaskClustering): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="v_measure", - date=None, - form=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + date=("2021-01-01", "2021-04-14"), + domains=["Web", "Social", "Written"], + task_subtypes=["Thematic clustering"], + license="not specified", # derived from pushshift + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@article{geigle:2021:arxiv, author = {Gregor Geigle and Nils Reimers and diff --git a/mteb/tasks/Clustering/eng/RedditClusteringP2P.py b/mteb/tasks/Clustering/eng/RedditClusteringP2P.py index 1e8d51cdfa..fc74844a2e 100644 --- a/mteb/tasks/Clustering/eng/RedditClusteringP2P.py +++ b/mteb/tasks/Clustering/eng/RedditClusteringP2P.py @@ 
-29,14 +29,13 @@ class RedditClusteringP2P(AbsTaskClustering): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="v_measure", - date=None, - form=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + date=("2021-01-01", "2021-04-14"), + domains=["Web", "Social", "Written"], + task_subtypes=["Thematic clustering"], + license="not specified", # derived from pushshift + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@article{geigle:2021:arxiv, author = {Gregor Geigle and Nils Reimers and diff --git a/mteb/tasks/Clustering/eng/StackExchangeClustering.py b/mteb/tasks/Clustering/eng/StackExchangeClustering.py index b123ab5bd1..c495b10de4 100644 --- a/mteb/tasks/Clustering/eng/StackExchangeClustering.py +++ b/mteb/tasks/Clustering/eng/StackExchangeClustering.py @@ -87,14 +87,13 @@ class StackExchangeClustering(AbsTaskClustering): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="v_measure", - date=None, - form=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + date=("2021-01-01", "2021-04-14"), + domains=["Web", "Written"], + task_subtypes=["Thematic clustering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@article{geigle:2021:arxiv, author = {Gregor Geigle and Nils Reimers and diff --git a/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py b/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py index c411138e9f..a06eb82ae9 100644 --- a/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py +++ b/mteb/tasks/Clustering/eng/StackExchangeClusteringP2P.py @@ -91,13 +91,13 @@ class StackExchangeClusteringP2P(AbsTaskClustering): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="v_measure", - date=None, - domains=None, - task_subtypes=None, - license=None, - 
annotations_creators=None, - dialect=None, - sample_creation=None, + date=("2021-01-01", "2021-04-14"), + domains=["Web", "Written"], + task_subtypes=["Thematic clustering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@article{geigle:2021:arxiv, author = {Gregor Geigle and Nils Reimers and diff --git a/mteb/tasks/PairClassification/eng/TwitterSemEval2015PC.py b/mteb/tasks/PairClassification/eng/TwitterSemEval2015PC.py index b8bc686d87..9da7c1072e 100644 --- a/mteb/tasks/PairClassification/eng/TwitterSemEval2015PC.py +++ b/mteb/tasks/PairClassification/eng/TwitterSemEval2015PC.py @@ -21,12 +21,12 @@ class TwitterSemEval2015PC(AbsTaskPairClassification): eval_langs=["eng-Latn"], main_score="max_ap", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Social", "Written"], + task_subtypes=[], + license="not specified", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{xu-etal-2015-semeval, title = "{S}em{E}val-2015 Task 1: Paraphrase and Semantic Similarity in {T}witter ({PIT})", author = "Xu, Wei and diff --git a/mteb/tasks/PairClassification/eng/TwitterURLCorpusPC.py b/mteb/tasks/PairClassification/eng/TwitterURLCorpusPC.py index 24839e5938..85432b1d97 100644 --- a/mteb/tasks/PairClassification/eng/TwitterURLCorpusPC.py +++ b/mteb/tasks/PairClassification/eng/TwitterURLCorpusPC.py @@ -21,12 +21,12 @@ class TwitterURLCorpusPC(AbsTaskPairClassification): eval_langs=["eng-Latn"], main_score="max_ap", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Social", "Written"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", 
bibtex_citation="""@inproceedings{lan-etal-2017-continuously, title = "A Continuously Growing Dataset of Sentential Paraphrases", author = "Lan, Wuwei and diff --git a/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py b/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py index 90fe689cdd..b9dfde0055 100644 --- a/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py +++ b/mteb/tasks/Reranking/eng/AskUbuntuDupQuestions.py @@ -21,12 +21,12 @@ class AskUbuntuDupQuestions(AbsTaskReranking): eval_langs=["eng-Latn"], main_score="map", date=None, - domains=None, + domains=["Programming", "Web"], task_subtypes=None, license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", prompt="Retrieve duplicate questions from AskUbuntu forum", bibtex_citation="""@article{wang-2021-TSDAE, title = "TSDAE: Using Transformer-based Sequential Denoising Auto-Encoderfor Unsupervised Sentence Embedding Learning", diff --git a/mteb/tasks/Reranking/eng/StackOverflowDupQuestions.py b/mteb/tasks/Reranking/eng/StackOverflowDupQuestions.py index 9e47461620..897f9d7bc9 100644 --- a/mteb/tasks/Reranking/eng/StackOverflowDupQuestions.py +++ b/mteb/tasks/Reranking/eng/StackOverflowDupQuestions.py @@ -20,13 +20,13 @@ class StackOverflowDupQuestions(AbsTaskReranking): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="map", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + date=("2014-01-21", "2018-01-01"), + domains=["Written", "Blog", "Programming"], + task_subtypes=["Question answering"], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", prompt="Retrieve duplicate questions from StackOverflow forum", bibtex_citation="""@article{Liu2018LinkSOAD, title={LinkSO: a dataset for learning to retrieve similar question answer pairs on software development forums}, 
diff --git a/mteb/tasks/Retrieval/eng/CQADupstackAndroidRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackAndroidRetrieval.py index b95c61af47..156395a077 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackAndroidRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackAndroidRetrieval.py @@ -21,12 +21,12 @@ class CQADupstackAndroidRetrieval(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Programming", "Web", "Written", "Non-fiction"], + task_subtypes=["Question answering", "Duplicate Detection"], + license="apache-2.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{hoogeveen2015, author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy}, title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, diff --git a/mteb/tasks/Retrieval/eng/CQADupstackEnglishRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackEnglishRetrieval.py index d9f1c1f344..af47eda5c4 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackEnglishRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackEnglishRetrieval.py @@ -21,12 +21,12 @@ class CQADupstackEnglishRetrieval(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written"], + task_subtypes=["Question answering", "Duplicate Detection"], + license="apache-2.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{hoogeveen2015, author = {Hoogeveen, Doris and Verspoor, Karin M. 
and Baldwin, Timothy}, title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, diff --git a/mteb/tasks/Retrieval/eng/CQADupstackGamingRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackGamingRetrieval.py index 8c89299957..b51a3e64b5 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackGamingRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackGamingRetrieval.py @@ -21,12 +21,12 @@ class CQADupstackGamingRetrieval(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Web", "Written"], + task_subtypes=["Question answering", "Duplicate Detection"], + license="apache-2.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{hoogeveen2015, author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy}, title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, diff --git a/mteb/tasks/Retrieval/eng/CQADupstackGisRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackGisRetrieval.py index 8ed296b003..da38284f2d 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackGisRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackGisRetrieval.py @@ -21,12 +21,12 @@ class CQADupstackGisRetrieval(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written", "Non-fiction"], + task_subtypes=["Question answering", "Duplicate Detection"], + license="apache-2.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{hoogeveen2015, author = {Hoogeveen, Doris and Verspoor, Karin M. 
and Baldwin, Timothy}, title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, diff --git a/mteb/tasks/Retrieval/eng/CQADupstackMathematicaRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackMathematicaRetrieval.py index 0d1804e5e7..b29d166129 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackMathematicaRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackMathematicaRetrieval.py @@ -21,12 +21,12 @@ class CQADupstackMathematicaRetrieval(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written", "Academic", "Non-fiction"], + task_subtypes=["Question answering", "Duplicate Detection"], + license="apache-2.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{hoogeveen2015, author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy}, title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, diff --git a/mteb/tasks/Retrieval/eng/CQADupstackPhysicsRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackPhysicsRetrieval.py index 77402252f9..3dd0fdc4a5 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackPhysicsRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackPhysicsRetrieval.py @@ -21,12 +21,12 @@ class CQADupstackPhysicsRetrieval(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written", "Academic", "Non-fiction"], + task_subtypes=["Question answering", "Duplicate Detection"], + license="apache-2.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{hoogeveen2015, author = {Hoogeveen, Doris and Verspoor, Karin M. 
and Baldwin, Timothy}, title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, diff --git a/mteb/tasks/Retrieval/eng/CQADupstackProgrammersRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackProgrammersRetrieval.py index 1fa63dd20a..f84b1b17e4 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackProgrammersRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackProgrammersRetrieval.py @@ -23,7 +23,7 @@ class CQADupstackProgrammersRetrieval(AbsTaskRetrieval): date=None, domains=["Programming", "Written", "Non-fiction"], task_subtypes=[], - license="cc-by-sa-4.0", + license="apache-2.0", annotations_creators="derived", dialect=[], sample_creation="found", diff --git a/mteb/tasks/Retrieval/eng/CQADupstackStatsRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackStatsRetrieval.py index 8b2ee5950a..1fd18f8d84 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackStatsRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackStatsRetrieval.py @@ -21,12 +21,12 @@ class CQADupstackStatsRetrieval(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written", "Academic", "Non-fiction"], + task_subtypes=["Question answering", "Duplicate Detection"], + license="apache-2.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{hoogeveen2015, author = {Hoogeveen, Doris and Verspoor, Karin M. 
and Baldwin, Timothy}, title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, diff --git a/mteb/tasks/Retrieval/eng/CQADupstackTexRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackTexRetrieval.py index 2e87f49710..c4447442be 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackTexRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackTexRetrieval.py @@ -21,12 +21,12 @@ class CQADupstackTexRetrieval(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written", "Non-fiction"], + task_subtypes=["Question answering", "Duplicate Detection"], + license="apache-2.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{hoogeveen2015, author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy}, title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, diff --git a/mteb/tasks/Retrieval/eng/CQADupstackUnixRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackUnixRetrieval.py index f86d886519..57c9964b15 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackUnixRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackUnixRetrieval.py @@ -21,12 +21,12 @@ class CQADupstackUnixRetrieval(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written", "Web", "Programming"], + task_subtypes=["Question answering", "Duplicate Detection"], + license="apache-2.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{hoogeveen2015, author = {Hoogeveen, Doris and Verspoor, Karin M. 
and Baldwin, Timothy}, title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, diff --git a/mteb/tasks/Retrieval/eng/CQADupstackWebmastersRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackWebmastersRetrieval.py index eedacec19a..2e9bd63e08 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackWebmastersRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackWebmastersRetrieval.py @@ -21,12 +21,12 @@ class CQADupstackWebmastersRetrieval(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written", "Web"], + task_subtypes=["Question answering"], + license="apache-2.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{hoogeveen2015, author = {Hoogeveen, Doris and Verspoor, Karin M. and Baldwin, Timothy}, title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, diff --git a/mteb/tasks/Retrieval/eng/CQADupstackWordpressRetrieval.py b/mteb/tasks/Retrieval/eng/CQADupstackWordpressRetrieval.py index e70255c371..3b11866f82 100644 --- a/mteb/tasks/Retrieval/eng/CQADupstackWordpressRetrieval.py +++ b/mteb/tasks/Retrieval/eng/CQADupstackWordpressRetrieval.py @@ -21,12 +21,12 @@ class CQADupstackWordpressRetrieval(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written", "Web", "Programming"], + task_subtypes=["Question answering"], + license="apache-2.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{hoogeveen2015, author = {Hoogeveen, Doris and Verspoor, Karin M. 
and Baldwin, Timothy}, title = {CQADupStack: A Benchmark Data Set for Community Question-Answering Research}, diff --git a/mteb/tasks/Retrieval/eng/ClimateFEVERRetrieval.py b/mteb/tasks/Retrieval/eng/ClimateFEVERRetrieval.py index d60b7a3817..b87e5223e0 100644 --- a/mteb/tasks/Retrieval/eng/ClimateFEVERRetrieval.py +++ b/mteb/tasks/Retrieval/eng/ClimateFEVERRetrieval.py @@ -21,12 +21,12 @@ class ClimateFEVER(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Encyclopaedic", "Written"], + task_subtypes=["Claim verification"], + license="cc-by-sa-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", bibtex_citation="""@misc{diggelmann2021climatefever, title={CLIMATE-FEVER: A Dataset for Verification of Real-World Climate Claims}, author={Thomas Diggelmann and Jordan Boyd-Graber and Jannis Bulian and Massimiliano Ciaramita and Markus Leippold}, @@ -57,12 +57,12 @@ class ClimateFEVERHardNegatives(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Encyclopaedic", "Written"], + task_subtypes=["Claim verification"], + license="cc-by-sa-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", bibtex_citation="""@misc{diggelmann2021climatefever, title={CLIMATE-FEVER: A Dataset for Verification of Real-World Climate Claims}, author={Thomas Diggelmann and Jordan Boyd-Graber and Jannis Bulian and Massimiliano Ciaramita and Markus Leippold}, diff --git a/mteb/tasks/Retrieval/eng/FEVERRetrieval.py b/mteb/tasks/Retrieval/eng/FEVERRetrieval.py index 776fd2fbe6..fff60a54d2 100644 --- a/mteb/tasks/Retrieval/eng/FEVERRetrieval.py +++ b/mteb/tasks/Retrieval/eng/FEVERRetrieval.py @@ -27,12 +27,12 
@@ class FEVER(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Encyclopaedic", "Written"], + task_subtypes=["Claim verification"], + license="cc-by-nc-sa-3.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{thorne-etal-2018-fever, title = "{FEVER}: a Large-scale Dataset for Fact Extraction and {VER}ification", author = "Thorne, James and diff --git a/mteb/tasks/Retrieval/eng/FiQA2018Retrieval.py b/mteb/tasks/Retrieval/eng/FiQA2018Retrieval.py index 1489cd168c..7a99d48a95 100644 --- a/mteb/tasks/Retrieval/eng/FiQA2018Retrieval.py +++ b/mteb/tasks/Retrieval/eng/FiQA2018Retrieval.py @@ -23,12 +23,12 @@ class FiQA2018(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written", "Financial"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{ thakur2021beir, title={{BEIR}: A Heterogeneous Benchmark for Zero-shot Evaluation of Information Retrieval Models}, diff --git a/mteb/tasks/Retrieval/eng/MSMARCORetrieval.py b/mteb/tasks/Retrieval/eng/MSMARCORetrieval.py index 5ada0cf887..6ebb5d7277 100644 --- a/mteb/tasks/Retrieval/eng/MSMARCORetrieval.py +++ b/mteb/tasks/Retrieval/eng/MSMARCORetrieval.py @@ -23,12 +23,23 @@ class MSMARCO(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=[ + "Encyclopaedic", + "Academic", + "Blog", + "News", + "Medical", + "Government", + "Reviews", + 
"Non-fiction", + "Social", + "Web", + ], + task_subtypes=["Question answering"], + license="msr-la-nc", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@article{DBLP:journals/corr/NguyenRSGTMD16, author = {Tri Nguyen and Mir Rosenberg and @@ -73,12 +84,23 @@ class MSMARCOHardNegatives(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=[ + "Encyclopaedic", + "Academic", + "Blog", + "News", + "Medical", + "Government", + "Reviews", + "Non-fiction", + "Social", + "Web", + ], + task_subtypes=["Question answering"], + license="msr-la-nc", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@article{DBLP:journals/corr/NguyenRSGTMD16, author = {Tri Nguyen and Mir Rosenberg and diff --git a/mteb/tasks/Retrieval/eng/MSMARCOv2Retrieval.py b/mteb/tasks/Retrieval/eng/MSMARCOv2Retrieval.py index d3b10738cf..7487abb887 100644 --- a/mteb/tasks/Retrieval/eng/MSMARCOv2Retrieval.py +++ b/mteb/tasks/Retrieval/eng/MSMARCOv2Retrieval.py @@ -21,12 +21,23 @@ class MSMARCOv2(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=[ + "Encyclopaedic", + "Academic", + "Blog", + "News", + "Medical", + "Government", + "Reviews", + "Non-fiction", + "Social", + "Web", + ], + task_subtypes=["Question answering"], + license="msr-la-nc", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@article{DBLP:journals/corr/NguyenRSGTMD16, author = {Tri Nguyen and Mir Rosenberg and diff --git a/mteb/tasks/Retrieval/eng/NQRetrieval.py b/mteb/tasks/Retrieval/eng/NQRetrieval.py index 661bf3e0e2..85e45e832c 100644 --- a/mteb/tasks/Retrieval/eng/NQRetrieval.py +++ 
b/mteb/tasks/Retrieval/eng/NQRetrieval.py @@ -21,12 +21,12 @@ class NQ(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written", "Encyclopaedic"], + task_subtypes=["Question answering"], + license="cc-by-nc-sa-3.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", bibtex_citation="""@article{47761,title = {Natural Questions: a Benchmark for Question Answering Research}, author = {Tom Kwiatkowski and Jennimaria Palomaki and Olivia Redfield and Michael Collins and Ankur Parikh and Chris Alberti and Danielle Epstein and Illia Polosukhin and Matthew Kelcey and Jacob Devlin and Kenton Lee diff --git a/mteb/tasks/Retrieval/eng/QuoraRetrieval.py b/mteb/tasks/Retrieval/eng/QuoraRetrieval.py index 73660fb573..52e6cca4b1 100644 --- a/mteb/tasks/Retrieval/eng/QuoraRetrieval.py +++ b/mteb/tasks/Retrieval/eng/QuoraRetrieval.py @@ -26,12 +26,12 @@ class QuoraRetrieval(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written", "Web", "Blog"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", bibtex_citation="""@misc{quora-question-pairs, author = {DataCanary, hilfialkaff, Lili Jiang, Meg Risdal, Nikhil Dandekar, tomtung}, title = {Quora Question Pairs}, diff --git a/mteb/tasks/Retrieval/eng/SciFactRetrieval.py b/mteb/tasks/Retrieval/eng/SciFactRetrieval.py index 8caa0c2af5..a44eb052bd 100644 --- a/mteb/tasks/Retrieval/eng/SciFactRetrieval.py +++ b/mteb/tasks/Retrieval/eng/SciFactRetrieval.py @@ -22,8 +22,8 @@ class SciFact(AbsTaskRetrieval): main_score="ndcg_at_10", date=None, domains=["Academic", "Medical", "Written"], - 
task_subtypes=None, - license=None, + task_subtypes=[], + license="not specified", annotations_creators=None, dialect=None, sample_creation=None, diff --git a/mteb/tasks/Retrieval/kor/AutoRAGRetrieval.py b/mteb/tasks/Retrieval/kor/AutoRAGRetrieval.py index 4a24e04e9c..6eec67aad2 100644 --- a/mteb/tasks/Retrieval/kor/AutoRAGRetrieval.py +++ b/mteb/tasks/Retrieval/kor/AutoRAGRetrieval.py @@ -22,7 +22,7 @@ class AutoRAGRetrieval(AbsTaskRetrieval): eval_langs=["kor-Hang"], main_score="ndcg_at_10", date=("2024-08-03", "2024-08-03"), - domains=["Government", "Medical", "Legal", "Social"], + domains=["Government", "Medical", "Legal", "Social", "Financial"], task_subtypes=["Article retrieval"], license="mit", annotations_creators="human-annotated", diff --git a/mteb/tasks/Retrieval/pol/ArguAnaPLRetrieval.py b/mteb/tasks/Retrieval/pol/ArguAnaPLRetrieval.py index 342f727144..ada5c4ca8e 100644 --- a/mteb/tasks/Retrieval/pol/ArguAnaPLRetrieval.py +++ b/mteb/tasks/Retrieval/pol/ArguAnaPLRetrieval.py @@ -24,11 +24,11 @@ class ArguAnaPL(AbsTaskRetrieval): eval_langs=["pol-Latn"], main_score="ndcg_at_10", date=None, - domains=None, + domains=["Medical", "Written"], task_subtypes=None, - license=None, + license="cc-by-sa-4.0", annotations_creators=None, - dialect=None, + dialect=[], sample_creation=None, bibtex_citation="""@misc{wojtasik2024beirpl, title={BEIR-PL: Zero Shot Information Retrieval Benchmark for the Polish Language}, diff --git a/mteb/tasks/Retrieval/pol/FiQAPLRetrieval.py b/mteb/tasks/Retrieval/pol/FiQAPLRetrieval.py index 0a125f5e4f..b54f4ae4ed 100644 --- a/mteb/tasks/Retrieval/pol/FiQAPLRetrieval.py +++ b/mteb/tasks/Retrieval/pol/FiQAPLRetrieval.py @@ -24,12 +24,12 @@ class FiQAPLRetrieval(AbsTaskRetrieval): eval_langs=["pol-Latn"], main_score="ndcg_at_10", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Written", "Financial"], + task_subtypes=["Question answering"], + 
license="not specified", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", bibtex_citation="""@inproceedings{ thakur2021beir, title={{BEIR}: A Heterogeneous Benchmark for Zero-shot Evaluation of Information Retrieval Models}, diff --git a/mteb/tasks/STS/eng/BiossesSTS.py b/mteb/tasks/STS/eng/BiossesSTS.py index ce54e37789..1fc1d5a1d0 100644 --- a/mteb/tasks/STS/eng/BiossesSTS.py +++ b/mteb/tasks/STS/eng/BiossesSTS.py @@ -21,12 +21,12 @@ class BiossesSTS(AbsTaskSTS): eval_langs=["eng-Latn"], main_score="cosine_spearman", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Medical"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", bibtex_citation="""@article{10.1093/bioinformatics/btx238, author = {Soğancıoğlu, Gizem and Öztürk, Hakime and Özgür, Arzucan}, title = "{BIOSSES: a semantic sentence similarity estimation system for the biomedical domain}", diff --git a/mteb/tasks/STS/eng/STSBenchmarkSTS.py b/mteb/tasks/STS/eng/STSBenchmarkSTS.py index 099fba6773..e600711d34 100644 --- a/mteb/tasks/STS/eng/STSBenchmarkSTS.py +++ b/mteb/tasks/STS/eng/STSBenchmarkSTS.py @@ -21,12 +21,12 @@ class STSBenchmarkSTS(AbsTaskSTS): eval_langs=["eng-Latn"], main_score="cosine_spearman", date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + domains=["Blog", "News", "Written"], + task_subtypes=[], + license="not specified", + annotations_creators="human-annotated", + dialect=[], + sample_creation="machine-translated and verified", bibtex_citation="""@InProceedings{huggingface:dataset:stsb_multi_mt, title = {Machine translated multilingual STS benchmark dataset.}, author={Philip May}, From 6989fd5f036e0d0c08c542e5458546e0a64ea03e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 
21:07:32 +0000 Subject: [PATCH 062/275] Update tasks table --- docs/tasks.md | 64 +++++++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/docs/tasks.md b/docs/tasks.md index a9467a95f0..4400a96f77 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -31,14 +31,14 @@ The following tables give you an overview of the tasks in MTEB. | [ArXivHierarchicalClusteringS2S](https://www.kaggle.com/Cornell-University/arxiv) | ['eng'] | Clustering | p2p | [Academic, Written] | None | None | | [ArguAna](http://argumentation.bplaced.net/arguana/data) (Boteva et al., 2016) | ['eng'] | Retrieval | s2p | [Medical, Written] | None | None | | [ArguAna-Fa](https://huggingface.co/datasets/MCINext/arguana-fa) | ['fas'] | Retrieval | s2p | [Blog] | None | None | -| [ArguAna-PL](https://huggingface.co/datasets/clarin-knext/arguana-pl) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | +| [ArguAna-PL](https://huggingface.co/datasets/clarin-knext/arguana-pl) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Medical, Written] | None | None | | [ArmenianParaphrasePC](https://github.com/ivannikov-lab/arpa-paraphrase-corpus) (Arthur Malajyan, 2020) | ['hye'] | PairClassification | s2s | [News, Written] | None | None | | [ArxivClassification](https://ieeexplore.ieee.org/document/8675939) (He et al., 2019) | ['eng'] | Classification | s2s | [Academic, Written] | None | None | -| [AskUbuntuDupQuestions](https://github.com/taolei87/askubuntu) | ['eng'] | Reranking | s2s | | {'test': 375} | {'test': {'num_samples': 375, 'number_of_characters': 413674, 'num_positive': 2255, 'num_negative': 5245, 'min_query_length': 17, 'avg_query_length': 50.21, 'max_query_length': 148, 'unique_query': 374, 'min_positive_length': 15, 'avg_positive_length': 52.54, 'max_positive_length': 152, 'unique_positive': 2165, 'min_negative_length': 15, 'avg_negative_length': 52.69, 'max_negative_length': 148, 'unique_negative': 5002}} | +| 
[AskUbuntuDupQuestions](https://github.com/taolei87/askubuntu) | ['eng'] | Reranking | s2s | [Programming, Web] | {'test': 375} | {'test': {'num_samples': 375, 'number_of_characters': 413674, 'num_positive': 2255, 'num_negative': 5245, 'min_query_length': 17, 'avg_query_length': 50.21, 'max_query_length': 148, 'unique_query': 374, 'min_positive_length': 15, 'avg_positive_length': 52.54, 'max_positive_length': 152, 'unique_positive': 2165, 'min_negative_length': 15, 'avg_negative_length': 52.69, 'max_negative_length': 148, 'unique_negative': 5002}} | | [Assin2RTE](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) (Real et al., 2020) | ['por'] | PairClassification | s2s | [Written] | None | None | | [Assin2STS](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) (Real et al., 2020) | ['por'] | STS | s2s | [Written] | None | None | -| [AutoRAGRetrieval](https://arxiv.org/abs/2410.20878) (Dongkyu Kim, 2024) | ['kor'] | Retrieval | s2p | [Government, Medical, Legal, Social] | {'test': 834} | {'test': {'number_of_characters': 894.22, 'num_samples': 834, 'num_queries': 114, 'num_documents': 720, 'average_document_length': 1.15, 'average_query_length': 0.61, 'average_relevant_docs_per_query': 1.0}} | -| [BIOSSES](https://tabilab.cmpe.boun.edu.tr/BIOSSES/DataSet.html) (Soğancıoğlu et al., 2017) | ['eng'] | STS | s2s | | None | None | +| [AutoRAGRetrieval](https://arxiv.org/abs/2410.20878) (Dongkyu Kim, 2024) | ['kor'] | Retrieval | s2p | [Government, Medical, Legal, Social, Financial] | {'test': 834} | {'test': {'number_of_characters': 894.22, 'num_samples': 834, 'num_queries': 114, 'num_documents': 720, 'average_document_length': 1.15, 'average_query_length': 0.61, 'average_relevant_docs_per_query': 1.0}} | +| [BIOSSES](https://tabilab.cmpe.boun.edu.tr/BIOSSES/DataSet.html) (Soğancıoğlu et al., 2017) | ['eng'] | STS | s2s | [Medical] | None | None | | [BQ](https://aclanthology.org/2021.emnlp-main.357) (Shitao Xiao, 2024) | ['cmn'] | STS | s2s | | 
None | None | | [BSARDRetrieval](https://huggingface.co/datasets/maastrichtlawtech/bsard) (Louis et al., 2022) | ['fra'] | Retrieval | s2p | [Legal, Spoken] | None | None | | [BUCC.v2](https://comparable.limsi.fr/bucc2018/bucc2018-task.html) | ['cmn', 'deu', 'eng', 'fra', 'rus'] | BitextMining | s2s | [Written] | {'test': 35000} | {'test': {'num_samples': 35000, 'number_of_characters': 6640032, 'unique_pairs': 34978, 'min_sentence1_length': 16, 'average_sentence1_length': 99.11, 'max_sentence1_length': 204, 'unique_sentence1': 34978, 'min_sentence2_length': 42, 'average_sentence2_length': 90.61, 'max_sentence2_length': 159, 'unique_sentence2': 25306, 'hf_subset_descriptive_stats': {'de-en': {'num_samples': 9580, 'number_of_characters': 1919197, 'unique_pairs': 9573, 'min_sentence1_length': 50, 'average_sentence1_length': 109.08, 'max_sentence1_length': 204, 'unique_sentence1': 9573, 'min_sentence2_length': 46, 'average_sentence2_length': 91.25, 'max_sentence2_length': 155, 'unique_sentence2': 9570}, 'fr-en': {'num_samples': 9086, 'number_of_characters': 1677545, 'unique_pairs': 9081, 'min_sentence1_length': 43, 'average_sentence1_length': 99.32, 'max_sentence1_length': 174, 'unique_sentence1': 9081, 'min_sentence2_length': 42, 'average_sentence2_length': 85.31, 'max_sentence2_length': 159, 'unique_sentence2': 9076}, 'ru-en': {'num_samples': 14435, 'number_of_characters': 2808206, 'unique_pairs': 14425, 'min_sentence1_length': 40, 'average_sentence1_length': 101.66, 'max_sentence1_length': 186, 'unique_sentence1': 14425, 'min_sentence2_length': 45, 'average_sentence2_length': 92.88, 'max_sentence2_length': 159, 'unique_sentence2': 14424}, 'zh-en': {'num_samples': 1899, 'number_of_characters': 235084, 'unique_pairs': 1899, 'min_sentence1_length': 16, 'average_sentence1_length': 28.43, 'max_sentence1_length': 40, 'unique_sentence1': 1899, 'min_sentence2_length': 48, 'average_sentence2_length': 95.36, 'max_sentence2_length': 159, 'unique_sentence2': 1899}}}} | @@ 
-69,31 +69,31 @@ The following tables give you an overview of the tasks in MTEB. | [CMedQAv2-reranking](https://github.com/zhangsheng93/cMedQA2) (S. Zhang, 2018) | ['cmn'] | Reranking | s2s | [Medical, Written] | None | None | | [COIRCodeSearchNetRetrieval](https://huggingface.co/datasets/code_search_net/) (Husain et al., 2019) | ['go', 'java', 'javascript', 'php', 'python', 'ruby'] | Retrieval | p2p | [Programming, Written] | {'test': 1056326} | {'test': {'number_of_characters': 36843313, 'num_samples': 1056326, 'num_queries': 52561, 'num_documents': 1003765, 'min_document_length': 54, 'average_document_length': 34.71, 'max_document_length': 334374, 'unique_documents': 1003765, 'min_query_length': 2, 'average_query_length': 38.19, 'max_query_length': 2, 'unique_queries': 52561, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 52561, 'hf_subset_descriptive_stats': {'python': {'number_of_characters': 14574651, 'num_samples': 295228, 'num_queries': 14918, 'num_documents': 280310, 'min_document_length': 95, 'average_document_length': 49.99, 'max_document_length': 14008, 'unique_documents': 280310, 'min_query_length': 2, 'average_query_length': 37.58, 'max_query_length': 2, 'unique_queries': 14918, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 14918}, 'javascript': {'number_of_characters': 2587540, 'num_samples': 68145, 'num_queries': 3291, 'num_documents': 64854, 'min_document_length': 87, 'average_document_length': 37.9, 'max_document_length': 334374, 'unique_documents': 64854, 'min_query_length': 2, 'average_query_length': 39.41, 'max_query_length': 2, 'unique_queries': 3291, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3291}, 'go': {'number_of_characters': 3641108, 'num_samples': 190562, 'num_queries': 8122, 
'num_documents': 182440, 'min_document_length': 54, 'average_document_length': 17.96, 'max_document_length': 5280, 'unique_documents': 182440, 'min_query_length': 2, 'average_query_length': 44.92, 'max_query_length': 2, 'unique_queries': 8122, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 8122}, 'ruby': {'number_of_characters': 629446, 'num_samples': 28831, 'num_queries': 1261, 'num_documents': 27570, 'min_document_length': 83, 'average_document_length': 20.83, 'max_document_length': 3992, 'unique_documents': 27570, 'min_query_length': 2, 'average_query_length': 43.73, 'max_query_length': 2, 'unique_queries': 1261, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1261}, 'java': {'number_of_characters': 6791137, 'num_samples': 191821, 'num_queries': 10955, 'num_documents': 180866, 'min_document_length': 77, 'average_document_length': 35.55, 'max_document_length': 7615, 'unique_documents': 180866, 'min_query_length': 2, 'average_query_length': 33.02, 'max_query_length': 2, 'unique_queries': 10955, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10955}, 'php': {'number_of_characters': 8619431, 'num_samples': 281739, 'num_queries': 14014, 'num_documents': 267725, 'min_document_length': 94, 'average_document_length': 30.2, 'max_document_length': 4904, 'unique_documents': 267725, 'min_query_length': 2, 'average_query_length': 38.21, 'max_query_length': 2, 'unique_queries': 14014, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 14014}}}} | | [CPUSpeedTask](https://github.com/KennethEnevoldsen/scandinavian-embedding-benchmark/blob/c8376f967d1294419be1d3eb41217d04cd3a65d3/src/seb/registered_tasks/speed.py#L83-L96) | ['eng'] | Speed | s2s | 
[Fiction, Written] | None | None | -| [CQADupstackAndroidRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | +| [CQADupstackAndroidRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Web, Written, Non-fiction] | None | None | | [CQADupstackAndroidRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-android-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackEnglishRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | +| [CQADupstackEnglishRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written] | None | None | | [CQADupstackEnglishRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-english-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackGamingRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | +| [CQADupstackGamingRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Web, Written] | None | None | | [CQADupstackGamingRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-gaming-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackGisRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | +| [CQADupstackGisRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Non-fiction] | None | None | | [CQADupstackGisRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-gis-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| 
[CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | +| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackMathematicaRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-mathematica-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | +| [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Academic, Web, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | -| [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | +| 
[CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | +| [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Non-fiction] | None | None | | [CQADupstackTexRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-tex-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | +| [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Web, Programming] | None | None | | [CQADupstackUnixRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-unix-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackWebmastersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | +| [CQADupstackWebmastersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Web] | None | None | | [CQADupstackWebmastersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-webmasters-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackWordpressRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | | None | None | +| 
[CQADupstackWordpressRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Web, Programming] | None | None | | [CQADupstackWordpressRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-wordpress-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CSFDCZMovieReviewSentimentClassification](https://arxiv.org/abs/2304.01922) (Michal Štefánik, 2023) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | | [CSFDSKMovieReviewSentimentClassification](https://arxiv.org/abs/2304.01922) (Michal Štefánik, 2023) | ['slk'] | Classification | s2s | [Reviews, Written] | None | None | @@ -141,9 +141,9 @@ The following tables give you an overview of the tasks in MTEB. | [CataloniaTweetClassification](https://aclanthology.org/2020.lrec-1.171/) | ['cat', 'spa'] | Classification | s2s | [Social, Government, Written] | None | None | | [ChemHotpotQARetrieval](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Retrieval | s2p | [Chemistry] | None | None | | [ChemNQRetrieval](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Retrieval | s2p | [Chemistry] | None | None | -| [ClimateFEVER](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | | None | None | +| [ClimateFEVER](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [ClimateFEVER-Fa](https://huggingface.co/datasets/MCINext/climate-fever-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [ClimateFEVERHardNegatives](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | | None | None | +| [ClimateFEVERHardNegatives](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | 
Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [CmedqaRetrieval](https://aclanthology.org/2022.emnlp-main.357.pdf) | ['cmn'] | Retrieval | s2p | [Medical, Written] | None | None | | [Cmnli](https://huggingface.co/datasets/clue/viewer/cmnli) | ['cmn'] | PairClassification | s2s | | None | None | | [CodeEditSearchRetrieval](https://huggingface.co/datasets/cassanof/CodeEditSearch/viewer) (Niklas Muennighoff, 2023) | ['c', 'c++', 'go', 'java', 'javascript', 'php', 'python', 'ruby', 'rust', 'scala', 'shell', 'swift', 'typescript'] | Retrieval | p2p | [Programming, Written] | {'train': 26000} | {'train': {'number_of_characters': 935841, 'num_samples': 26000, 'num_queries': 13000, 'num_documents': 13000, 'min_document_length': 18, 'average_document_length': 70.99, 'max_document_length': 2532, 'unique_documents': 13000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 13000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 13000, 'hf_subset_descriptive_stats': {'python': {'number_of_characters': 70519, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 21, 'average_document_length': 69.52, 'max_document_length': 1811, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'javascript': {'number_of_characters': 57880, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 18, 'average_document_length': 56.88, 'max_document_length': 601, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 
1, 'unique_relevant_docs': 1000}, 'typescript': {'number_of_characters': 61092, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 60.09, 'max_document_length': 659, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'go': {'number_of_characters': 71797, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 70.8, 'max_document_length': 1529, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'ruby': {'number_of_characters': 67900, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 20, 'average_document_length': 66.9, 'max_document_length': 751, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'java': {'number_of_characters': 63984, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 23, 'average_document_length': 62.98, 'max_document_length': 807, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'php': {'number_of_characters': 62927, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 21, 
'average_document_length': 61.93, 'max_document_length': 766, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'c': {'number_of_characters': 98588, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 20, 'average_document_length': 97.59, 'max_document_length': 1672, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'c++': {'number_of_characters': 115480, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 22, 'average_document_length': 114.48, 'max_document_length': 1856, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'rust': {'number_of_characters': 68503, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 67.5, 'max_document_length': 2532, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'swift': {'number_of_characters': 58279, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 57.28, 'max_document_length': 727, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 
'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'scala': {'number_of_characters': 65833, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 22, 'average_document_length': 64.83, 'max_document_length': 685, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'shell': {'number_of_characters': 73059, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 18, 'average_document_length': 72.06, 'max_document_length': 813, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}}}} | @@ -206,7 +206,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [EmotionClassification](https://www.aclweb.org/anthology/D18-1404) | ['eng'] | Classification | s2s | [Social, Written] | None | None | | [EstQA](https://www.semanticscholar.org/paper/Extractive-Question-Answering-for-Estonian-Language-182912IAPM-Alum%C3%A4e/ea4f60ab36cadca059c880678bc4c51e293a85d6?utm_source=direct_link) | ['est'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [EstonianValenceClassification](https://figshare.com/articles/dataset/Estonian_Valence_Corpus_Eesti_valentsikorpus/24517054) | ['est'] | Classification | s2s | [News, Written] | None | None | -| [FEVER](https://fever.ai/) | ['eng'] | Retrieval | s2p | | None | None | +| [FEVER](https://fever.ai/) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [FEVERHardNegatives](https://fever.ai/) | ['eng'] | Retrieval | s2p | | None | None | | [FQuADRetrieval](https://huggingface.co/datasets/manu/fquad2_test) | ['fra'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [FaithDial](https://mcgill-nlp.github.io/FaithDial) (Dziri et al., 2022) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | @@ -216,14 +216,14 @@ The following tables give you an overview of the tasks in MTEB. 
| [FarsiParaphraseDetection](https://huggingface.co/datasets/alighasemi/farsi_paraphrase_detection) | ['fas'] | PairClassification | s2s | | None | None | | [Farsick](https://github.com/ZahraGhasemi-AI/FarSick) | ['fas'] | STS | s2s | | None | None | | [FeedbackQARetrieval](https://arxiv.org/abs/2204.03025) | ['eng'] | Retrieval | s2p | [Web, Government, Medical, Written] | None | None | -| [FiQA-PL](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['pol'] | Retrieval | s2p | | None | None | -| [FiQA2018](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['eng'] | Retrieval | s2p | | None | None | +| [FiQA-PL](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['pol'] | Retrieval | s2p | [Written, Financial] | None | None | +| [FiQA2018](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['eng'] | Retrieval | s2p | [Written, Financial] | None | None | | [FiQA2018-Fa](https://huggingface.co/datasets/MCINext/fiqa-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [FilipinoHateSpeechClassification](https://pcj.csp.org.ph/index.php/pcj/issue/download/29/PCJ%20V14%20N1%20pp1-14%202019) (Neil Vicente Cabasag et al., 2019) | ['fil'] | Classification | s2s | [Social, Written] | None | None | | [FilipinoShopeeReviewsClassification](https://uijrt.com/articles/v4/i8/UIJRTV4I80009.pdf) | ['fil'] | Classification | s2s | [Social, Written] | None | None | | [FinParaSTS](https://huggingface.co/datasets/TurkuNLP/turku_paraphrase_corpus) | ['fin'] | STS | s2s | [News, Subtitles, Written] | None | None | | [FinToxicityClassification](https://aclanthology.org/2023.nodalida-1.68) | ['fin'] | Classification | s2s | [News, Written] | None | None | -| [FinancialPhrasebankClassification](https://arxiv.org/abs/1307.5336) (P. Malo, 2014) | ['eng'] | Classification | s2s | [News, Written] | None | None | +| [FinancialPhrasebankClassification](https://arxiv.org/abs/1307.5336) (P. 
Malo, 2014) | ['eng'] | Classification | s2s | [News, Written, Financial] | None | None | | [FloresBitextMining](https://huggingface.co/datasets/facebook/flores) (Goyal et al., 2022) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | BitextMining | s2s | [Non-fiction, Encyclopaedic, Written] | None | None | | [FrenchBookReviews](https://huggingface.co/datasets/Abirate/french_book_reviews) | ['fra'] | Classification | s2s | [Reviews, Written] | None | None | | [FrenkEnClassification](https://arxiv.org/abs/1906.02045) (Nikola Ljubešić, 2019) | ['eng'] | Classification | s2s | [Social, Written] | None | None | @@ -294,7 +294,7 @@ The following tables give you an 
overview of the tasks in MTEB. | [KannadaNewsClassification](https://github.com/goru001/nlp-for-kannada) (Anoop Kunchukuttan, 2020) | ['kan'] | Classification | s2s | [News, Written] | None | None | | [KinopoiskClassification](https://www.dialog-21.ru/media/1226/blinovpd.pdf) (Blinov et al., 2013) | ['rus'] | Classification | p2p | [Reviews, Written] | None | None | | Ko-StrategyQA (Geva et al., 2021) | ['kor'] | Retrieval | s2p | | None | None | -| [KorFin](https://huggingface.co/datasets/amphora/korfin-asc) (Son et al., 2023) | ['kor'] | Classification | s2s | [News, Written] | None | None | +| [KorFin](https://huggingface.co/datasets/amphora/korfin-asc) (Son et al., 2023) | ['kor'] | Classification | s2s | [News, Written, Financial] | None | None | | [KorHateClassification](https://paperswithcode.com/dataset/korean-hatespeech-dataset) (Jihyung Moon, 2020) | ['kor'] | Classification | s2s | [Social, Written] | None | None | | [KorHateSpeechMLClassification](https://paperswithcode.com/dataset/korean-multi-label-hate-speech-dataset) | ['kor'] | MultilabelClassification | s2s | [Social, Written] | None | None | | [KorSTS](https://arxiv.org/abs/2004.03289) (Ham et al., 2020) | ['kor'] | STS | s2s | [News, Web] | None | None | @@ -345,12 +345,12 @@ The following tables give you an overview of the tasks in MTEB. 
| [MLSUMClusteringS2S.v2](https://huggingface.co/datasets/mteb/mlsum) (Scialom et al., 2020) | ['deu', 'fra', 'rus', 'spa'] | Clustering | s2s | [News, Written] | None | None | | [MMarcoReranking](https://github.com/unicamp-dl/mMARCO) (Luiz Henrique Bonifacio, 2021) | ['cmn'] | Reranking | s2s | | None | None | | [MMarcoRetrieval](https://arxiv.org/abs/2309.07597) (Shitao Xiao, 2024) | ['cmn'] | Retrieval | s2p | | None | None | -| [MSMARCO](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | | None | None | +| [MSMARCO](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Academic, Blog, News, Medical, Government, Reviews, Non-fiction, Social, Web] | None | None | | [MSMARCO-Fa](https://huggingface.co/datasets/MCINext/msmarco-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [MSMARCO-PL](https://microsoft.github.io/msmarco/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Web, Written] | None | None | | [MSMARCO-PLHardNegatives](https://microsoft.github.io/msmarco/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Web, Written] | None | None | -| [MSMARCOHardNegatives](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | | None | None | -| [MSMARCOv2](https://microsoft.github.io/msmarco/TREC-Deep-Learning.html) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | | None | None | +| [MSMARCOHardNegatives](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and 
Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Academic, Blog, News, Medical, Government, Reviews, Non-fiction, Social, Web] | None | None | +| [MSMARCOv2](https://microsoft.github.io/msmarco/TREC-Deep-Learning.html) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Academic, Blog, News, Medical, Government, Reviews, Non-fiction, Social, Web] | None | None | | [MTOPDomainClassification](https://arxiv.org/pdf/2008.09335.pdf) | ['deu', 'eng', 'fra', 'hin', 'spa', 'tha'] | Classification | s2s | [Spoken, Spoken] | None | None | | [MTOPIntentClassification](https://arxiv.org/pdf/2008.09335.pdf) | ['deu', 'eng', 'fra', 'hin', 'spa', 'tha'] | Classification | s2s | [Spoken, Spoken] | None | None | | [MacedonianTweetSentimentClassification](https://aclanthology.org/R15-1034/) | ['mkd'] | Classification | s2s | [Social, Written] | None | None | @@ -386,7 +386,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [NLPJournalTitleIntroRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | | [NLPTwitterAnalysisClassification](https://huggingface.co/datasets/hamedhf/nlp_twitter_analysis/tree/main) | ['fas'] | Classification | s2p | [Social] | None | None | | [NLPTwitterAnalysisClustering](https://huggingface.co/datasets/hamedhf/nlp_twitter_analysis/commits/main) | ['fas'] | Clustering | s2s | [Social] | None | None | -| [NQ](https://ai.google.com/research/NaturalQuestions/) (Tom Kwiatkowski, 2019) | ['eng'] | Retrieval | s2p | | None | None | +| [NQ](https://ai.google.com/research/NaturalQuestions/) (Tom Kwiatkowski, 2019) | ['eng'] | Retrieval | s2p | [Written, Encyclopaedic] | None | None | | [NQ-Fa](https://huggingface.co/datasets/MCINext/nq-fa) | ['fas'] | Retrieval | s2p | [Encyclopaedic] | None | None | | [NQ-PL](https://ai.google.com/research/NaturalQuestions/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | | [NQ-PLHardNegatives](https://ai.google.com/research/NaturalQuestions/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | @@ -477,7 +477,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [Query2Query](https://mcinext.com/) | ['fas'] | STS | s2s | | None | None | | [Quora-PL](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2s | | None | None | | [Quora-PLHardNegatives](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2s | | None | None | -| [QuoraRetrieval](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | | None | None | +| [QuoraRetrieval](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | [Written, Web, Blog] | None | None | | [QuoraRetrieval-Fa](https://huggingface.co/datasets/MCINext/quora-fa) | ['fas'] | Retrieval | s2s | [Web] | None | None | | [QuoraRetrievalHardNegatives](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | | None | None | | [RARbCode](https://arxiv.org/abs/2404.06347) (Xiao et al., 2024) | ['eng'] | Retrieval | s2p | [Programming, Written] | None | None | @@ -543,7 +543,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [STS17](https://alt.qcri.org/semeval2017/task1/) | ['ara', 'deu', 'eng', 'fra', 'ita', 'kor', 'nld', 'spa', 'tur'] | STS | s2s | [News, Web, Written] | {'test': 5346} | {'test': {'num_samples': 5346, 'number_of_characters': 400264, 'min_sentence1_length': 6, 'average_sentence1_len': 38.15, 'max_sentence1_length': 976, 'unique_sentence1': 4900, 'min_sentence2_length': 6, 'average_sentence2_len': 36.73, 'max_sentence2_length': 1007, 'unique_sentence2': 4470, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0, 'hf_subset_descriptive_stats': {'ko-ko': {'num_samples': 2846, 'number_of_characters': 183387, 'min_sentence1_length': 6, 'average_sentence1_len': 31.99, 'max_sentence1_length': 976, 'unique_sentence1': 2650, 'min_sentence2_length': 6, 'average_sentence2_len': 32.44, 'max_sentence2_length': 1007, 'unique_sentence2': 2720, 'min_score': 0.0, 'avg_score': 2.47, 'max_score': 5.0}, 'ar-ar': {'num_samples': 250, 'number_of_characters': 16247, 'min_sentence1_length': 11, 'average_sentence1_len': 32.21, 'max_sentence1_length': 99, 'unique_sentence1': 250, 'min_sentence2_length': 9, 'average_sentence2_len': 32.78, 'max_sentence2_length': 83, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.22, 'max_score': 5.0}, 'en-ar': {'num_samples': 250, 'number_of_characters': 18764, 'min_sentence1_length': 13, 'average_sentence1_len': 42.36, 'max_sentence1_length': 105, 'unique_sentence1': 250, 'min_sentence2_length': 10, 'average_sentence2_len': 32.7, 'max_sentence2_length': 104, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.14, 'max_score': 5.0}, 'en-de': {'num_samples': 250, 'number_of_characters': 22177, 'min_sentence1_length': 12, 'average_sentence1_len': 43.95, 'max_sentence1_length': 94, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 44.76, 'max_sentence2_length': 104, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'en-en': {'num_samples': 250, 'number_of_characters': 21669, 
'min_sentence1_length': 12, 'average_sentence1_len': 43.95, 'max_sentence1_length': 94, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'en-tr': {'num_samples': 250, 'number_of_characters': 20879, 'min_sentence1_length': 15, 'average_sentence1_len': 41.92, 'max_sentence1_length': 101, 'unique_sentence1': 250, 'min_sentence2_length': 10, 'average_sentence2_len': 41.6, 'max_sentence2_length': 107, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.13, 'max_score': 5.0}, 'es-en': {'num_samples': 250, 'number_of_characters': 23216, 'min_sentence1_length': 12, 'average_sentence1_len': 50.84, 'max_sentence1_length': 160, 'unique_sentence1': 250, 'min_sentence2_length': 14, 'average_sentence2_len': 42.02, 'max_sentence2_length': 117, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.15, 'max_score': 5.0}, 'es-es': {'num_samples': 250, 'number_of_characters': 25265, 'min_sentence1_length': 18, 'average_sentence1_len': 49.84, 'max_sentence1_length': 136, 'unique_sentence1': 250, 'min_sentence2_length': 13, 'average_sentence2_len': 51.22, 'max_sentence2_length': 129, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.23, 'max_score': 5.0}, 'fr-en': {'num_samples': 250, 'number_of_characters': 23087, 'min_sentence1_length': 19, 'average_sentence1_len': 49.62, 'max_sentence1_length': 115, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'it-en': {'num_samples': 250, 'number_of_characters': 23188, 'min_sentence1_length': 15, 'average_sentence1_len': 50.03, 'max_sentence1_length': 113, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 
'max_score': 5.0}, 'nl-en': {'num_samples': 250, 'number_of_characters': 22385, 'min_sentence1_length': 14, 'average_sentence1_len': 46.82, 'max_sentence1_length': 123, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}}}} | | [STS22.v2](https://competitions.codalab.org/competitions/33835) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'ita', 'pol', 'rus', 'spa', 'tur'] | STS | p2p | [News, Written] | None | None | | [STSB](https://aclanthology.org/2021.emnlp-main.357) (Shitao Xiao, 2024) | ['cmn'] | STS | s2s | | None | None | -| [STSBenchmark](https://github.com/PhilipMay/stsb-multi-mt/) (Philip May, 2021) | ['eng'] | STS | s2s | | None | None | +| [STSBenchmark](https://github.com/PhilipMay/stsb-multi-mt/) (Philip May, 2021) | ['eng'] | STS | s2s | [Blog, News, Written] | None | None | | [STSBenchmarkMultilingualSTS](https://github.com/PhilipMay/stsb-multi-mt/) (Philip May, 2021) | ['cmn', 'deu', 'eng', 'fra', 'ita', 'nld', 'pol', 'por', 'rus', 'spa'] | STS | s2s | [News, Social, Web, Spoken, Written] | None | None | | [STSES](https://huggingface.co/datasets/PlanTL-GOB-ES/sts-es) (Agirre et al., 2015) | ['spa'] | STS | s2s | [Written] | None | None | | [SadeemQuestionRetrieval](https://huggingface.co/datasets/sadeem-ai/sadeem-ar-eval-retrieval-questions) | ['ara'] | Retrieval | s2p | [Written, Written] | None | None | @@ -573,7 +573,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [SprintDuplicateQuestions](https://www.aclweb.org/anthology/D18-1131/) | ['eng'] | PairClassification | s2s | [Programming, Written] | None | None | | [StackExchangeClustering.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle, 2021) | ['eng'] | Clustering | s2s | [Web, Written] | None | None | | [StackExchangeClusteringP2P.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle, 2021) | ['eng'] | Clustering | p2p | [Web, Written] | None | None | -| [StackOverflowDupQuestions](https://www.microsoft.com/en-us/research/uploads/prod/2019/03/nl4se18LinkSO.pdf) (Xueqing Liu, 2018) | ['eng'] | Reranking | s2s | | None | None | +| [StackOverflowDupQuestions](https://www.microsoft.com/en-us/research/uploads/prod/2019/03/nl4se18LinkSO.pdf) (Xueqing Liu, 2018) | ['eng'] | Reranking | s2s | [Written, Blog, Programming] | None | None | | [StackOverflowQA](https://arxiv.org/abs/2407.02883) (Xiangyang Li, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'test': 21925} | {'test': {'number_of_characters': 26584028, 'num_samples': 21925, 'num_queries': 1994, 'num_documents': 19931, 'min_document_length': 61, 'average_document_length': 130.32, 'max_document_length': 22234, 'unique_documents': 19931, 'min_query_length': 5, 'average_query_length': 12029.38, 'max_query_length': 46028, 'unique_queries': 1994, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1994}} | | [StatcanDialogueDatasetRetrieval](https://mcgill-nlp.github.io/statcan-dialogue-dataset/) | ['eng', 'fra'] | Retrieval | s2p | [Government, Web, Written] | None | None | | [SummEvalFrSummarization.v2](https://github.com/Yale-LILY/SummEval) (Fabbri et al., 2020) | ['fra'] | Summarization | p2p | [News, Written] | None | None | @@ -661,8 +661,8 @@ The following tables give you an overview of the tasks in MTEB. 
| [TweetTopicSingleClassification](https://arxiv.org/abs/2209.09824) | ['eng'] | Classification | s2s | [Social, News, Written] | None | None | | [TwentyNewsgroupsClustering.v2](https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html) (Ken Lang, 1995) | ['eng'] | Clustering | s2s | [News, Written] | {'test': 59545} | {'test': {'num_samples': 59545, 'number_of_characters': 1907719, 'min_text_length': 11, 'average_text_length': 32.04, 'max_text_length': 120, 'min_labels_per_text': 2082, 'average_labels_per_text': 1.0, 'max_labels_per_text': 3236, 'unique_labels': 20, 'labels': {'12': {'count': 3137}, '6': {'count': 3070}, '0': {'count': 2613}, '2': {'count': 3155}, '10': {'count': 3220}, '17': {'count': 2986}, '14': {'count': 3106}, '13': {'count': 3055}, '1': {'count': 3056}, '16': {'count': 2911}, '9': {'count': 2984}, '3': {'count': 3070}, '15': {'count': 3090}, '7': {'count': 3036}, '5': {'count': 3124}, '11': {'count': 3236}, '18': {'count': 2483}, '8': {'count': 3090}, '19': {'count': 2082}, '4': {'count': 3041}}}} | | [TwitterHjerneRetrieval](https://huggingface.co/datasets/sorenmulli/da-hashtag-twitterhjerne) (Holm et al., 2024) | ['dan'] | Retrieval | p2p | [Social, Written] | None | None | -| [TwitterSemEval2015](https://alt.qcri.org/semeval2015/task1/) | ['eng'] | PairClassification | s2s | | None | None | -| [TwitterURLCorpus](https://languagenet.github.io/) | ['eng'] | PairClassification | s2s | | {'test': 51534} | {'test': {'num_samples': 51534, 'number_of_characters': 8659940, 'min_sentence1_length': 24, 'avg_sentence1_length': 79.49, 'max_sentence1_length': 126, 'unique_sentence1': 4329, 'min_sentence2_length': 6, 'avg_sentence2_length': 88.55, 'max_sentence2_length': 608, 'unique_sentence2': 41304, 'unique_labels': 2, 'labels': {'0': {'count': 38546}, '1': {'count': 12988}}}} | +| [TwitterSemEval2015](https://alt.qcri.org/semeval2015/task1/) | ['eng'] | PairClassification | s2s | [Social, Written] | None | None | +| 
[TwitterURLCorpus](https://languagenet.github.io/) | ['eng'] | PairClassification | s2s | [Social, Written] | {'test': 51534} | {'test': {'num_samples': 51534, 'number_of_characters': 8659940, 'min_sentence1_length': 24, 'avg_sentence1_length': 79.49, 'max_sentence1_length': 126, 'unique_sentence1': 4329, 'min_sentence2_length': 6, 'avg_sentence2_length': 88.55, 'max_sentence2_length': 608, 'unique_sentence2': 41304, 'unique_labels': 2, 'labels': {'0': {'count': 38546}, '1': {'count': 12988}}}} | | [UCCVCommonLawLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [UkrFormalityClassification](https://huggingface.co/datasets/ukr-detect/ukr-formality-dataset-translated-gyafc) | ['ukr'] | Classification | s2s | [News, Written] | None | None | | [UnfairTOSLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | From b78525d05bca45a10a77fe1c9474fd94747a3667 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 21:09:43 +0000 Subject: [PATCH 063/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 4400a96f77..070c17ada6 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Academic, Web, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Written, Non-fiction, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From e07ffe8d4502d99609286e262cc954346f60b427 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 21:12:09 +0000 Subject: [PATCH 064/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 070c17ada6..58d46c4240 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Written, Non-fiction, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Written, Web, Programming, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 25a6f17fe4807cf1834ec2f2e113ae86f09960fe Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 21:14:13 +0000 Subject: [PATCH 065/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 58d46c4240..8b6fde61a6 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Written, Web, Programming, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming, Academic, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 913112a3f3474281020e8092ce84d02d6c2a897c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 21:16:19 +0000 Subject: [PATCH 066/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 8b6fde61a6..89021b18ca 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming, Academic, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Academic, Programming, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 1030888fe108ee286bf4eb062f1ab054c8488d9a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 21:20:03 +0000 Subject: [PATCH 067/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 89021b18ca..1daf53c361 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Academic, Programming, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Web, Non-fiction, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From b23597c9b7d97b14c6292f1d4e1013b0b91192a6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 21:22:31 +0000 Subject: [PATCH 068/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 1daf53c361..fb1e21fa2a 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Web, Non-fiction, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written, Web, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From c34ef6473ca8516d7de7cf59ec3ebcf502ecb69a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 21:24:41 +0000 Subject: [PATCH 069/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index fb1e21fa2a..2ecdc643a7 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written, Web, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Web, Academic, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 780a7d3179b96a6b1d8b89f344f09854af889f39 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 21:26:47 +0000 Subject: [PATCH 070/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 2ecdc643a7..b33ce168d3 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Web, Academic, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Academic, Programming, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From ff4ae8dc4c41acf0aaf80cb0ab0a641d8833fba9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 21:29:35 +0000 Subject: [PATCH 071/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index b33ce168d3..7073e19548 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Academic, Programming, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Web, Academic, Programming, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 599849b47408dfb52f669aa14d39ca70e3673f69 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 21:31:44 +0000 Subject: [PATCH 072/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 7073e19548..d94296e19f 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Web, Academic, Programming, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Academic, Non-fiction, Web, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 933f4af9c3b2dde8983f29e78026e3bc36ee0e2a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 21:34:37 +0000 Subject: [PATCH 073/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index d94296e19f..fb1e21fa2a 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Academic, Non-fiction, Web, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written, Web, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 67f8a79f2589380bc690ce69aec372a71bd16bb7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 21:37:30 +0000 Subject: [PATCH 074/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index fb1e21fa2a..d7707795bd 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written, Web, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming, Web, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 1b76261192bc14f2d13d58177bf2495d3d7154e1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 21:40:10 +0000 Subject: [PATCH 075/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index d7707795bd..9db880dd0f 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming, Web, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Web, Written, Programming, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 51faf65040535ae6475a42d481c8431d73c0afe5 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 21:42:47 +0000 Subject: [PATCH 076/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 9db880dd0f..08a97db6c7 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Web, Written, Programming, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Academic, Programming, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From f7438b88745d3846721ddda34f70e45d4d661a70 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 21:44:55 +0000 Subject: [PATCH 077/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 08a97db6c7..931235fac7 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Academic, Programming, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Web, Written, Academic, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 3123d1cf3681cbd9f8bb99c496c571cde6c0f79d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 21:47:31 +0000 Subject: [PATCH 078/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 931235fac7..ec5254a162 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Web, Written, Academic, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Programming, Written, Web, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 0bbc4c701e8655abc95c3da74f1955126cba11d7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 21:50:49 +0000 Subject: [PATCH 079/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index ec5254a162..e733279c18 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Programming, Written, Web, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Web, Programming, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From c46cb8b1df5850e921a3704046b71354cfc80ad6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 21:53:13 +0000 Subject: [PATCH 080/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index e733279c18..b9a9a9e9d7 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Web, Programming, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Web, Academic, Programming, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 21b60f578940d29b89c4b60da4cb049ec780d3d9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 21:55:24 +0000 Subject: [PATCH 081/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index b9a9a9e9d7..09e470680f 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Web, Academic, Programming, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Academic, Programming, Non-fiction, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From d9ab239460a60f7592f6fc61b58994440accd68b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 21:57:29 +0000 Subject: [PATCH 082/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 09e470680f..ba1747b2fa 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Academic, Programming, Non-fiction, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Written, Academic, Web, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 9a6275eb540f6c97e659497bde09d9ee1765a704 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 22:01:58 +0000 Subject: [PATCH 083/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index ba1747b2fa..f2276bd932 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Written, Academic, Web, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Programming, Written, Non-fiction, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From e35c8dd71fe1431d583304ece365a31b5b4dc404 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 22:04:07 +0000 Subject: [PATCH 084/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index f2276bd932..fb1e21fa2a 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Programming, Written, Non-fiction, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written, Web, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From d510ddba80fb5396e0b3457aa2d608fc4f1006ea Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 22:06:12 +0000 Subject: [PATCH 085/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index fb1e21fa2a..bdd0781e3e 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written, Web, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Academic, Non-fiction, Programming, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 4bb4ec645adf0a51d02d12e41b4ab26b19681041 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 22:08:24 +0000 Subject: [PATCH 086/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index bdd0781e3e..83e58eaa78 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Academic, Non-fiction, Programming, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Programming, Written, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 9076213f35f0662bc98efe646849d58b910e2b41 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 22:11:26 +0000 Subject: [PATCH 087/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 83e58eaa78..06e7cb8157 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Programming, Written, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Academic, Web, Non-fiction, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 12ad5bd4e0c606f73a2aab5b8e66f11f53fb5d35 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 22:13:34 +0000 Subject: [PATCH 088/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 06e7cb8157..25cce73c36 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Academic, Web, Non-fiction, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Non-fiction, Programming, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 6df0b8c54c239eaec4e9a67d599dc30d1492e17d Mon Sep 17 00:00:00 2001 From: github-actions Date: Thu, 30 Jan 2025 22:22:44 +0000 Subject: [PATCH 089/275] 1.31.6 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b291f3f40d..d30dc99cea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.31.5" +version = "1.31.6" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 490b59cc5eeb82ba0ec2c26959129502c42b141c Mon Sep 17 
00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 22:24:56 +0000 Subject: [PATCH 090/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 25cce73c36..29f430f0c5 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. | [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Non-fiction, Programming, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Web, Academic, Written, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 93d631f59895a9079da1ba86de965dd6b72bde39 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 22:27:05 +0000 Subject: [PATCH 091/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/docs/tasks.md b/docs/tasks.md index 29f430f0c5..06ce14264c 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. | [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Web, Academic, Written, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Written, Academic, Non-fiction, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 257578c544d723f915cd546d91b20519741b90d9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 22:29:11 +0000 Subject: [PATCH 092/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 06ce14264c..2c69e8c89e 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Written, Academic, Non-fiction, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Non-fiction, Written, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 1275f932348004542fcecd9daf3b59b19e3c5f2c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 22:31:19 +0000 Subject: [PATCH 093/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 2c69e8c89e..e733279c18 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Non-fiction, Written, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Web, Programming, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 47c63c3a555915e1a5cb032866178c351e58db89 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 22:33:54 +0000 Subject: [PATCH 094/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index e733279c18..2985c3c90a 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Web, Programming, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Academic, Non-fiction, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From e29348ca49ce31cc8a59e11eb2ae32d04e8ac200 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 22:36:03 +0000 Subject: [PATCH 095/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 2985c3c90a..f2276bd932 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Academic, Non-fiction, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Programming, Written, Non-fiction, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 253a499eb4bef243b715afd9502a28ab9198d763 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 22:38:42 +0000 Subject: [PATCH 096/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index f2276bd932..f52c285d6e 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Programming, Written, Non-fiction, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Programming, Academic, Written, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 4d30059eac4a6e7d55e2f5784955108a2e17f2c5 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 22:42:22 +0000 Subject: [PATCH 097/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index f52c285d6e..f27b09998c 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Programming, Academic, Written, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Non-fiction, Web, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 0a073dff01b47794ba94088b6eb6d49502bd5c68 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 22:44:30 +0000 Subject: [PATCH 098/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index f27b09998c..914d367c34 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Non-fiction, Web, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Academic, Written, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 58d5248bca4ae7b197bc45ddce6435f9f5a812e2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 22:46:59 +0000 Subject: [PATCH 099/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 914d367c34..25cce73c36 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Academic, Written, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Non-fiction, Programming, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 1faa897a10fbaaede801737f9d1a906edb8ccbbd Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 22:49:03 +0000 Subject: [PATCH 100/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 25cce73c36..7073e19548 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Non-fiction, Programming, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Web, Academic, Programming, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From f539d9a92cc47e9aa9b9a28ef3926b4cc6f17bf2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 22:51:17 +0000 Subject: [PATCH 101/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 7073e19548..bf34d92a0a 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Web, Academic, Programming, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Written, Non-fiction, Programming, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 803e97374880c0f491956c8852cdd3249a7b712e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 22:54:11 +0000 Subject: [PATCH 102/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index bf34d92a0a..b3b9b86db0 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Written, Non-fiction, Programming, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written, Academic, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 1b1efa77ee4087e51a56f534855ca6a7e1392076 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 22:56:16 +0000 Subject: [PATCH 103/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index b3b9b86db0..2ecdc643a7 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written, Academic, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Web, Academic, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 10d4604e85e60886d0c6ca1b28bd7ede0c020737 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 22:58:40 +0000 Subject: [PATCH 104/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 2ecdc643a7..c2b4f20338 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Web, Academic, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Written, Programming, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From baab628fe669afd5d4274c9cab2a231a1eee08be Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 23:01:22 +0000 Subject: [PATCH 105/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index c2b4f20338..ec5254a162 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Written, Programming, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Programming, Written, Web, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 4a15db1ef23e4a765088a306f7fa7d6e73815fae Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 23:05:33 +0000 Subject: [PATCH 106/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index ec5254a162..685f341c56 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Programming, Written, Web, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Written, Non-fiction, Programming, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 2156389e46e4fc4e592a1c35410eefb36f4ee8cf Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 23:07:42 +0000 Subject: [PATCH 107/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 685f341c56..a29b43ce92 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Written, Non-fiction, Programming, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Written, Programming, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 55cf386ac30745c94650477c75b7730c0af3704b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 23:10:07 +0000 Subject: [PATCH 108/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index a29b43ce92..96c94c121b 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Written, Programming, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Web, Academic, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 750a9a91bb2887c78fcfb51d2088d2b5e0c287e2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 23:12:13 +0000 Subject: [PATCH 109/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 96c94c121b..15cc6b6fbb 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Web, Academic, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Academic, Programming, Web, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From acb37786ec6da7eb92d924f97dd2c783b0e4d1ac Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 23:14:26 +0000 Subject: [PATCH 110/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 15cc6b6fbb..8feb6f77ca 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Academic, Programming, Web, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Academic, Non-fiction, Written, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 616733d4a0862a7e4be5cd4f9946e4bab0e82d25 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 23:19:39 +0000 Subject: [PATCH 111/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 8feb6f77ca..070c17ada6 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Academic, Non-fiction, Written, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Written, Non-fiction, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 3c57df3d7cba844c6875bc16c763ecf4bc287b1d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 23:22:41 +0000 Subject: [PATCH 112/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 070c17ada6..b3b9b86db0 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Written, Non-fiction, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written, Academic, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 0f7206b682bc750910c0a8bb1c6291da2dbd5f73 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 23:24:49 +0000 Subject: [PATCH 113/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index b3b9b86db0..08a97db6c7 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Written, Academic, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Academic, Programming, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From fffce31636ba66f65274f079d9a6188261bdf8b4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 23:26:55 +0000 Subject: [PATCH 114/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 08a97db6c7..5170152c4d 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Academic, Programming, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Written, Non-fiction, Web, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 2d9a472f412c57de978dc9bf1d656003930a7e13 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 23:29:05 +0000 Subject: [PATCH 115/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 5170152c4d..c2b4f20338 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Written, Non-fiction, Web, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Written, Programming, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 9a2665bf3ec84a7c5c88301c1b50230a7931d661 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 23:31:12 +0000 Subject: [PATCH 116/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index c2b4f20338..21900e7ef2 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Written, Programming, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Web, Academic, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 28cf2704c9ae3194b70ecfb8ce798dac0cf9dbe5 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 23:33:33 +0000 Subject: [PATCH 117/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 21900e7ef2..3091a18502 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Web, Academic, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Academic, Programming, Web, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From ba68e0780639a67af52b966c0e478ee45fc8329f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 23:35:42 +0000 Subject: [PATCH 118/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 3091a18502..e17c330d13 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Academic, Programming, Web, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Academic, Web, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From fb90fa92b2c075d537184048f1bbbff90d888397 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 23:38:26 +0000 Subject: [PATCH 119/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index e17c330d13..96c94c121b 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Academic, Web, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Web, Academic, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From da7092c9bf671b4b5cace15d613ef2c1c43db056 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 23:40:47 +0000 Subject: [PATCH 120/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 96c94c121b..bdd0781e3e 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Web, Academic, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Academic, Non-fiction, Programming, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From b755b796111e71b24f546f552fd4a675667a5be5 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 23:42:49 +0000 Subject: [PATCH 121/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index bdd0781e3e..144ca30b3f 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Academic, Non-fiction, Programming, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Non-fiction, Programming, Web, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 1c24ef3fd6db45593c61671966de30e587bdc8a5 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 23:45:18 +0000 Subject: [PATCH 122/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 144ca30b3f..a18e38bdbf 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Non-fiction, Programming, Web, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Written, Programming, Non-fiction, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 4cc0fee1e1df3d2713d1085c459fe1e91707a871 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 23:48:21 +0000 Subject: [PATCH 123/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index a18e38bdbf..81b9ad2d00 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Written, Programming, Non-fiction, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Programming, Written, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 2191e83f5d1f755e580d8b16643ed62b3b36495a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 23:50:51 +0000 Subject: [PATCH 124/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 81b9ad2d00..1b04d7738b 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Programming, Written, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Web, Programming, Written, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 50f2598d69627c0d30fa2399e2a0603fa06cf650 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 23:53:17 +0000 Subject: [PATCH 125/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 1b04d7738b..b33ce168d3 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Web, Programming, Written, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Academic, Programming, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From ebaa650e988ccafd4da27681da582afd521156ac Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 23:55:50 +0000 Subject: [PATCH 126/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index b33ce168d3..a29b43ce92 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Academic, Programming, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Written, Programming, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 605f571a44349e90d1e571c2371d4fd828c1a773 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 30 Jan 2025 23:58:14 +0000 Subject: [PATCH 127/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index a29b43ce92..96c94c121b 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Written, Programming, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Web, Academic, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 579f946f788f6cbd76cce30c10eea3088b98aa97 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 00:00:18 +0000 Subject: [PATCH 128/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 96c94c121b..21900e7ef2 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Web, Academic, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Web, Academic, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 44fc1ae4da329fc267473ce82c0e9a027ca2fe29 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 00:04:18 +0000 Subject: [PATCH 129/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 21900e7ef2..25d73e46b6 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Web, Academic, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Academic, Programming, Non-fiction, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 4e2167a2b0730d2298f61b3aeb709a8cdf02fef8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 00:06:42 +0000 Subject: [PATCH 130/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 25d73e46b6..08a97db6c7 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Academic, Programming, Non-fiction, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Academic, Programming, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From eb80d8bf1041ee9527548235ef7d9a498348c7ab Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 00:08:45 +0000 Subject: [PATCH 131/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 08a97db6c7..7073e19548 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Academic, Programming, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Web, Academic, Programming, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From ef3fe1bc353f0144dc0a1bac37e41aecbd9acead Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 00:11:08 +0000 Subject: [PATCH 132/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 7073e19548..4400a96f77 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Web, Academic, Programming, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Academic, Web, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 03941bd8c5481c1e0762faae50d43d32c0bd836f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 00:14:18 +0000 Subject: [PATCH 133/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 4400a96f77..e49e1ae2b4 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Academic, Web, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Non-fiction, Programming, Written, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From d44d893da6f264f9c8c57bdba5e476aef2a96d60 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 00:16:23 +0000 Subject: [PATCH 134/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index e49e1ae2b4..25d73e46b6 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Non-fiction, Programming, Written, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Academic, Programming, Non-fiction, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 6dd2734c55863e0bb1a46793f8a905db91dbc30e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 00:20:52 +0000 Subject: [PATCH 135/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 25d73e46b6..7ee266d73e 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Academic, Programming, Non-fiction, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Academic, Written, Web, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From d4eaa9155231076546a47072d58c152684ab5c6d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 00:22:56 +0000 Subject: [PATCH 136/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 7ee266d73e..2985c3c90a 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Academic, Written, Web, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Academic, Non-fiction, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From d6752c0f6a2b6faa676e601d2ec7f55f090fad9a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 00:25:37 +0000 Subject: [PATCH 137/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 2985c3c90a..21900e7ef2 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Academic, Non-fiction, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Web, Academic, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From fd57157c12901ff06bde53a7ae9493473419d98a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 00:28:00 +0000 Subject: [PATCH 138/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 21900e7ef2..5170152c4d 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Web, Academic, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Written, Non-fiction, Web, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 9661751f96ee2bdfe79c008993c928e3b6e34cc2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 00:30:14 +0000 Subject: [PATCH 139/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 5170152c4d..8b6fde61a6 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Written, Non-fiction, Web, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming, Academic, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 20a16fb7d1ae8afb28a47cb8d03f609faabde7e8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 00:32:22 +0000 Subject: [PATCH 140/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 8b6fde61a6..e5697988c1 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming, Academic, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Web, Written, Academic, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 3f090265663d387bde206652a176608132302df6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 00:35:48 +0000 Subject: [PATCH 141/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index e5697988c1..1f6ffbc11c 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Web, Written, Academic, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Academic, Written, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 92d5d17a3c1081a40bd286e896f4e3b9b21159ca Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 00:38:23 +0000 Subject: [PATCH 142/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 1f6ffbc11c..08a97db6c7 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Academic, Written, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Academic, Programming, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From b60080d7bfc6e6c3c67fde41bb0c212bb5ef92cd Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 00:40:42 +0000 Subject: [PATCH 143/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 08a97db6c7..21900e7ef2 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Academic, Programming, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Web, Academic, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From d2de690273b78ba070c9809b65f0df09dcd5924b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 00:42:46 +0000 Subject: [PATCH 144/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 21900e7ef2..63ad104742 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Web, Academic, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Written, Programming, Web, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From fe6e2cda3c67d8000353f9f8cd015317a613c7b1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 00:45:18 +0000 Subject: [PATCH 145/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 63ad104742..e84cc18226 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Written, Programming, Web, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Web, Programming, Academic, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From cdd121a131eaf39004f1dff3dae0428ccb144f06 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 00:47:23 +0000 Subject: [PATCH 146/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index e84cc18226..19d370c273 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Web, Programming, Academic, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Academic, Non-fiction, Web, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 2a9fb4b9ddd170e386ab14d5bb3e8769cf86c732 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 00:50:03 +0000 Subject: [PATCH 147/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 19d370c273..1b04d7738b 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Academic, Non-fiction, Web, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Web, Programming, Written, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 9e9f2d1ea870025e9e90161ec426389d31c7b254 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 00:52:10 +0000 Subject: [PATCH 148/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 1b04d7738b..b7cf75d915 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Web, Programming, Written, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Academic, Non-fiction, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From b384daea7ba6c1c4660afc21f101009de4197c43 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 00:54:35 +0000 Subject: [PATCH 149/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index b7cf75d915..8cb4e301cb 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Academic, Non-fiction, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Programming, Web, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 4cd6ad0b18b7b2a3c35e5485eaa612ab717455e9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 00:57:00 +0000 Subject: [PATCH 150/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 8cb4e301cb..2c69e8c89e 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Programming, Web, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Non-fiction, Written, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 74551ca2a7dcc20bad5c2bf73e04ebe282e7be1a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 01:00:52 +0000 Subject: [PATCH 151/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 2c69e8c89e..605fb6e1f5 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Non-fiction, Written, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Programming, Written, Web, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From b3eb993bb88edfab95067fe085dbd7df6ef4c3d8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 01:03:29 +0000 Subject: [PATCH 152/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 605fb6e1f5..b33ce168d3 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Programming, Written, Web, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Academic, Programming, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 8866cc2ec8f9cf3b1cd551880ee688423bac4c1e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 01:05:59 +0000 Subject: [PATCH 153/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index b33ce168d3..29f430f0c5 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Academic, Programming, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Web, Academic, Written, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 50bfeaf4cbf96ad17bd90d8138c3730ec228161c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 01:08:10 +0000 Subject: [PATCH 154/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 29f430f0c5..b56050d087 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Web, Academic, Written, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Non-fiction, Academic, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From d9495200d865bc5e8bf74674b327040093960bdc Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 01:10:16 +0000 Subject: [PATCH 155/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index b56050d087..3232c15c48 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Non-fiction, Academic, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Written, Non-fiction, Web, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From fc940e5450732d2e75a9547860eb47011177e2fc Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 01:12:33 +0000 Subject: [PATCH 156/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 3232c15c48..bceff34030 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Written, Non-fiction, Web, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Academic, Written, Non-fiction, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From e8b37f75ad4c04972192045ab9b6cf3ed67cca09 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 01:14:48 +0000 Subject: [PATCH 157/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index bceff34030..29f430f0c5 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Academic, Written, Non-fiction, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Web, Academic, Written, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 1d08e42547dcae1fbbe47c4ae0d018bc699426c6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 01:17:10 +0000 Subject: [PATCH 158/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 29f430f0c5..d7eb9e4fc8 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Web, Academic, Written, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Web, Programming, Academic, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From e1926ea0adfcc54e125a42de00e44f598691502a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 01:19:35 +0000 Subject: [PATCH 159/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index d7eb9e4fc8..fd66e09a65 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Web, Programming, Academic, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Web, Non-fiction, Academic, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 5651f6f526d8fca95cc07b12bea97f7a484d95e6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 01:21:58 +0000 Subject: [PATCH 160/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index fd66e09a65..83e58eaa78 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Web, Non-fiction, Academic, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Programming, Written, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 03ffb4abc9f8f71aba3bdefedb9baf1908d0fbac Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 01:24:35 +0000 Subject: [PATCH 161/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 83e58eaa78..ea3c160947 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Programming, Written, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Programming, Academic, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 0ed6c34387a7de9df2354fcd8ee7a34cde75f756 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 01:26:38 +0000 Subject: [PATCH 162/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index ea3c160947..8c3ea4bf87 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Programming, Academic, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Web, Non-fiction, Academic, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 11c24527776eead78ec34288fb9ab455631eb2f6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 01:29:10 +0000 Subject: [PATCH 163/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 8c3ea4bf87..daed22cc99 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Web, Non-fiction, Academic, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Academic, Written, Programming, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 5f2e277804c8c6e649b2ba32f4768ea116897ce8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 01:32:07 +0000 Subject: [PATCH 164/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index daed22cc99..6f8970e83a 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Academic, Written, Programming, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Programming, Non-fiction, Written, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 1791fc840b90c757ebbe297a4f4c249b561e8218 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 01:34:58 +0000 Subject: [PATCH 165/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 6f8970e83a..a241c951c6 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Programming, Non-fiction, Written, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Programming, Non-fiction, Web, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From dea83b16c0938e64359ea2f8011cf9c0ad5bcf95 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 01:37:23 +0000 Subject: [PATCH 166/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index a241c951c6..5e4a9aadae 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Programming, Non-fiction, Web, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Written, Academic, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From b58a615474abbad48282f63ee0fd2f80f4856037 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 01:39:33 +0000 Subject: [PATCH 167/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 5e4a9aadae..60d3cf5b2c 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Written, Academic, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Academic, Web, Programming, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From be4f0dae24f41b6615b42d6e8e3e550a29d5f032 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 01:41:32 +0000 Subject: [PATCH 168/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 60d3cf5b2c..9c3eecd8b6 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Academic, Web, Programming, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Written, Academic, Non-fiction, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 9e4166f9696462fdb6755d370b0317a4a3665672 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 01:44:09 +0000 Subject: [PATCH 169/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 9c3eecd8b6..bdd0781e3e 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Written, Academic, Non-fiction, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Academic, Non-fiction, Programming, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 8aa5a699c6524c3dd5ac431593fdf60f11e89b73 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 01:46:41 +0000 Subject: [PATCH 170/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index bdd0781e3e..e17c330d13 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Academic, Non-fiction, Programming, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Academic, Web, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 813e7117b380311cd54b4e1818e0347fa033263c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 01:50:28 +0000 Subject: [PATCH 171/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index e17c330d13..63ea09bc18 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Academic, Web, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Academic, Written, Non-fiction, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 51a314ce12fbbcf71268d3dea40c052034da293c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 01:52:52 +0000 Subject: [PATCH 172/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 63ea09bc18..2c69e8c89e 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Academic, Written, Non-fiction, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Non-fiction, Written, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 8baee527993de867fe020b11bce9e92f7875aa7c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 31 Jan 2025 01:55:15 +0000 Subject: [PATCH 173/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 2c69e8c89e..98693b74eb 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Non-fiction, Written, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Web, Non-fiction, Programming, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 21d32f0b96135fc8f95ce6fd7e513109274a806b Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 1 Feb 2025 12:42:19 +0100 Subject: [PATCH 174/275] fix: remove SummaryRetrieval as a type (#1915) --- mteb/abstasks/TaskMetadata.py | 1 - .../fas/FaMTEBSummaryRetrieval.py | 16 ++++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py index df6a48c1f0..3f115b2dbb 100644 --- a/mteb/abstasks/TaskMetadata.py +++ b/mteb/abstasks/TaskMetadata.py @@ -97,7 +97,6 @@ "Summarization", "InstructionRetrieval", "Speed", - "SummaryRetrieval", ] diff --git 
a/mteb/tasks/SummaryRetrieval/fas/FaMTEBSummaryRetrieval.py b/mteb/tasks/SummaryRetrieval/fas/FaMTEBSummaryRetrieval.py index cf3f9dbe52..c8d36b9082 100644 --- a/mteb/tasks/SummaryRetrieval/fas/FaMTEBSummaryRetrieval.py +++ b/mteb/tasks/SummaryRetrieval/fas/FaMTEBSummaryRetrieval.py @@ -13,7 +13,7 @@ class SAMSumFa(AbsTaskBitextMining): "path": "MCINext/samsum-fa", "revision": "fd981d78a0ab82c20d2e693a8b3929c5d71b0743", }, - type="SummaryRetrieval", + type="BitextMining", category="s2p", modalities=["text"], eval_splits=["test"], @@ -25,8 +25,8 @@ class SAMSumFa(AbsTaskBitextMining): license="not specified", annotations_creators="LM-generated", dialect=[], - sample_creation="found", - bibtex_citation=""" """, + sample_creation="machine-translated", + bibtex_citation="", ) def dataset_transform(self): @@ -35,16 +35,16 @@ def dataset_transform(self): ) -class SynPerChatbotSumSRetrieval(AbsTaskBitextMining): +class SynPerChatbotSumSBitextMining(AbsTaskBitextMining): metadata = TaskMetadata( name="SynPerChatbotSumSRetrieval", - description="Synthetic Persian Chatbot Summary Dataset", + description="Synthetic Persian Chatbot Summary Dataset", reference="https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-summary-retrieval", dataset={ "path": "MCINext/synthetic-persian-chatbot-summary-retrieval", "revision": "9002f5e9de4ef61f1f5c34831d2a5ed855bac0ae", }, - type="SummaryRetrieval", + type="BitextMining", category="p2p", modalities=["text"], eval_splits=["test"], @@ -66,7 +66,7 @@ def dataset_transform(self): ) -class SynPerChatbotRAGSumSRetrieval(AbsTaskBitextMining): +class SynPerChatbotRAGSumSBitextMining(AbsTaskBitextMining): metadata = TaskMetadata( name="SynPerChatbotRAGSumSRetrieval", description="Synthetic Persian Chatbot RAG Summary Dataset", @@ -75,7 +75,7 @@ class SynPerChatbotRAGSumSRetrieval(AbsTaskBitextMining): "path": "MCINext/synthetic-persian-chatbot-rag-summary-retrieval", "revision": "f77746f286bbf2177ee7b5a803da8be440d5d4c1", }, - 
type="SummaryRetrieval", + type="BitextMining", category="p2p", modalities=["text"], eval_splits=["test"], From eb837f16c4e283eda968961f646b6f2276a7c2b7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 11:44:29 +0000 Subject: [PATCH 175/275] Update tasks table --- docs/tasks.md | 2116 ++++++++++++++++++++++++------------------------- 1 file changed, 1058 insertions(+), 1058 deletions(-) diff --git a/docs/tasks.md b/docs/tasks.md index 98693b74eb..804b9f8475 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. | [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Web, Non-fiction, Programming, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Non-fiction, Written, Programming, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | @@ -503,7 +503,7 @@ The following tables give you an 
overview of the tasks in MTEB. | [RuSciBenchGRNTIClusteringP2P](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Clustering | p2p | [Academic, Written] | {'test': 2048} | {'test': {'num_samples': 2048, 'number_of_characters': 1822339, 'min_text_length': 84, 'average_text_length': 889.81, 'max_text_length': 3143, 'min_labels_per_text': 73, 'average_labels_per_text': 1.0, 'max_labels_per_text': 74, 'unique_labels': 28, 'labels': {'3': {'count': 73}, '4': {'count': 73}, '20': {'count': 73}, '9': {'count': 73}, '21': {'count': 73}, '15': {'count': 73}, '16': {'count': 74}, '2': {'count': 73}, '8': {'count': 73}, '23': {'count': 73}, '6': {'count': 73}, '24': {'count': 73}, '10': {'count': 73}, '1': {'count': 73}, '17': {'count': 74}, '14': {'count': 74}, '18': {'count': 73}, '27': {'count': 73}, '19': {'count': 73}, '22': {'count': 73}, '12': {'count': 73}, '25': {'count': 73}, '5': {'count': 74}, '0': {'count': 73}, '26': {'count': 73}, '11': {'count': 73}, '13': {'count': 73}, '7': {'count': 73}}}} | | [RuSciBenchOECDClassification](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Classification | p2p | [Academic, Written] | None | None | | [RuSciBenchOECDClusteringP2P](https://github.com/mlsa-iai-msu-lab/ru_sci_bench/) | ['rus'] | Clustering | p2p | [Academic, Written] | None | None | -| [SAMSumFa](https://huggingface.co/datasets/MCINext/samsum-fa) | ['fas'] | SummaryRetrieval | s2p | [Spoken] | None | None | +| [SAMSumFa](https://huggingface.co/datasets/MCINext/samsum-fa) | ['fas'] | BitextMining | s2p | [Spoken] | None | None | | [SCDBPAccountabilityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [SCDBPAuditsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | 
[SCDBPCertificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | @@ -600,12 +600,12 @@ The following tables give you an overview of the tasks in MTEB. | [SynPerChatbotConvSAToneUserClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | | [SynPerChatbotRAGFAQPC](https://mcinext.com/) | ['fas'] | PairClassification | s2p | [Spoken] | None | None | | [SynPerChatbotRAGFAQRetrieval](https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-rag-faq-retrieval) | ['fas'] | Retrieval | s2p | [Spoken] | None | None | -| [SynPerChatbotRAGSumSRetrieval](https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-rag-summary-retrieval) | ['fas'] | SummaryRetrieval | p2p | [Spoken] | None | None | +| [SynPerChatbotRAGSumSRetrieval](https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-rag-summary-retrieval) | ['fas'] | BitextMining | p2p | [Spoken] | None | None | | [SynPerChatbotRAGToneChatbotClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | | [SynPerChatbotRAGToneUserClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | | [SynPerChatbotRAGTopicsRetrieval](https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-rag-topics-retrieval) | ['fas'] | Retrieval | s2p | [Spoken] | None | None | | [SynPerChatbotSatisfactionLevelClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | -| [SynPerChatbotSumSRetrieval](https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-summary-retrieval) | ['fas'] | SummaryRetrieval | p2p | [Spoken] | None | None | +| [SynPerChatbotSumSRetrieval](https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-summary-retrieval) | ['fas'] | BitextMining | p2p | [Spoken] | None | None | | 
[SynPerChatbotToneChatbotClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | | [SynPerChatbotToneUserClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | | [SynPerChatbotTopicsRetrieval](https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-topics-retrieval) | ['fas'] | Retrieval | s2p | [Spoken] | None | None | @@ -723,1060 +723,1060 @@ The following tables give you an overview of the tasks in MTEB.
-| ISO Code | Language | Family | BitextMining | Classification | Clustering | InstructionRetrieval | MultilabelClassification | PairClassification | Reranking | Retrieval | STS | Speed | Summarization | SummaryRetrieval | Sum | -|---|------|------|------|------|------|------|------|------|------|------|------|------|---| -| aai | Arifama-Miniafia | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aak | Ankave | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aau | Abau | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aaz | Amarasi | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| abs | Ambonese Malay | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| abt | Ambulas | Ndu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| abx | Inabaknon | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aby | Aneme Wake | Yareban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ace | Achinese | Austronesian | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| acf | Saint Lucian Creole French | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| acm | Mesopotamian Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | -| acq | Ta'izzi-Adeni Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| acr | Achi | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| acu | Achuar-Shiwiar | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| adz | Adzera | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aeb | Tunisian Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| aer | Eastern Arrernte | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aey | Amele | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| afr | Afrikaans | Indo-European | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 
| 0 | 0 | 10 | -| agd | Agarabi | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agg | Angor | Senagi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agm | Angaataha | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agn | Agutaynen | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agr | Aguaruna | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agt | Central Cagayan Agta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| agu | Aguacateco | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aia | Arosi | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aii | Assyrian Neo-Aramaic | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ajp | South Levantine Arabic | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| aka | Akan | Atlantic-Congo | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ake | Akawaio | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| alp | Alune | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| alq | Algonquin | Algic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| als | Tosk Albanian | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 5 | -| aly | Alyawarr | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ame | Yanesha' | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amf | Hamer-Banna | South Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amh | Amharic | Afro-Asiatic | 3 | 6 | 3 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 14 | -| amk | Ambai | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amm | Ama (Papua New Guinea) | Left May | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amn | Amanab | Border | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amo | Amo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 1 | -| amp | Alamblak | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amr | Amarakaeri | Harakmbut | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amu | Guerrero Amuzgo | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| amx | Anmatyerre | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ang | Old English (ca. 450-1100) | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| anh | Nend | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| anp | Angika | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| anv | Denya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aoi | Anindilyakwa | Gunwinyguan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aoj | Mufian | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aom | Ömie | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aon | Bumbita Arapesh | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apb | Sa'a | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apc | Levantine Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | -| ape | Bukiyip | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apn | Apinayé | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apr | Arop-Lokep | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apu | Apurinã | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apw | Western Apache | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| apz | Safeyoka | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ara | Arabic | Unclassified | 2 | 12 | 0 | 0 | 0 | 2 | 2 | 9 | 2 | 0 | 0 | 0 | 29 | -| arb | Standard Arabic | Afro-Asiatic | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 8 | 
-| are | Western Arrarnta | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| arl | Arabela | Zaparoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| arn | Mapudungun | Araucanian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| arp | Arapaho | Algic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| arq | Algerian Arabic | Afro-Asiatic | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | -| ars | Najdi Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | -| ary | Moroccan Arabic | Afro-Asiatic | 1 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 7 | -| arz | Egyptian Arabic | Afro-Asiatic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 5 | -| asm | Assamese | Indo-European | 5 | 3 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 14 | -| aso | Dano | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ast | Asturian | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ata | Pele-Ata | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| atb | Zaiwa | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| atd | Ata Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| atg | Ivbie North-Okpela-Arhe | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| att | Pamplona Atta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| auc | Waorani | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| aui | Anuki | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| auy | Awiyaana | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| avt | Au | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| awa | Awadhi | Indo-European | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| awb | Awa (Papua New Guinea) | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 
-| awk | Awabakal | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| awx | Awara | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ayr | Central Aymara | Aymaran | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| azb | South Azerbaijani | Turkic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| aze | Azerbaijani | Unclassified | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| azg | San Pedro Amuzgos Amuzgo | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| azj | North Azerbaijani | Turkic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | -| azz | Highland Puebla Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bak | Bashkir | Turkic | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| bam | Bambara | Mande | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 5 | -| ban | Balinese | Austronesian | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| bao | Waimaha | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bba | Baatonum | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bbb | Barai | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bbc | Batak Toba | Austronesian | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| bbr | Girawa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bch | Bariai | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bco | Kaluli | Bosavi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bdd | Bunama | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bea | Beaver | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bef | Benabena | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bel | Belarusian | Indo-European | 4 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| bem | Bemba (Zambia) | 
Atlantic-Congo | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ben | Bengali | Indo-European | 7 | 9 | 2 | 0 | 0 | 1 | 2 | 6 | 1 | 0 | 0 | 0 | 28 | -| beo | Beami | Bosavi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ber | Berber (Other) | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| beu | Blagar | Timor-Alor-Pantar | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bew | Betawi | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| bgc | Haryanvi | Indo-European | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| bgs | Tagabawa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bgt | Bughotu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bhb | Bhili | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bhd | Bhadrawahi | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bhg | Binandere | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bhl | Bimin | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bho | Bhojpuri | Indo-European | 2 | 2 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| bhp | Bima | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| big | Biangai | Kunimaipan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjj | Kanauji | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjk | Barok | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjn | Banjar | Austronesian | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| bjp | Fanamaket | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjr | Binumarien | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjv | Bedjond | Central Sudanic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bjz | Baruga | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 1 | -| bkd | Binukid | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bki | Baki | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bkq | Bakairí | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bkx | Baikeno | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| blw | Balangao | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| blz | Balantak | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bmh | Kein | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bmk | Ghayavi | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bmr | Muinane | Boran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bmu | Somba-Siawari | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bnp | Bola | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bns | Bundeli | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| boa | Bora | Boran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bod | Tibetan | Sino-Tibetan | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 6 | -| boj | Anjam | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bon | Bine | Eastern Trans-Fly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bos | Bosnian | Indo-European | 3 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| box | Buamu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| boy | Bodo (Central African Republic) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bpr | Koronadal Blaan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bps | Sarangani Blaan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bqc | Boko (Benin) | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bqp | Busa | Mande | 1 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bra | Braj | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bre | Breton | Indo-European | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| brx | Bodo (India) | Sino-Tibetan | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| bsj | Bangwinji | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bsn | Barasana-Eduria | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bsp | Baga Sitemu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bss | Akoose | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bug | Buginese | Austronesian | 2 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| buk | Bugawac | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bul | Bulgarian | Indo-European | 3 | 4 | 1 | 0 | 1 | 1 | 1 | 2 | 0 | 0 | 0 | 0 | 13 | -| bus | Bokobaru | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bvd | Baeggu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bvr | Burarra | Maningrida | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bxh | Buhutu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| byr | Baruya | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| byx | Qaqet | Baining | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bzd | Bribri | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bzh | Mapos Buang | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| bzj | Belize Kriol English | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| caa | Chortí | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cab | Garifuna | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cac | Chuj | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| caf | Southern Carrier | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cak | Kaqchikel | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cao | Chácobo | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cap | Chipaya | Uru-Chipaya | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| car | Galibi Carib | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cat | Catalan | Indo-European | 3 | 2 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | -| cav | Cavineña | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cax | Chiquitano | Chiquitano | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbc | Carapana | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbi | Chachi | Barbacoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbk | Chavacano | Indo-European | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| cbr | Cashibo-Cacataibo | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbs | Cashinahua | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbt | Chayahuita | Cahuapanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbu | Candoshi-Shapra | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cbv | Cacua | Kakua-Nukak | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cco | Comaltepec Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ceb | Cebuano | Austronesian | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 6 | -| cek | Eastern Khumi Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ces | Czech | Indo-European | 4 | 5 | 2 | 0 | 1 | 2 | 1 | 2 | 0 | 0 | 0 | 0 | 17 | -| cgc | Kagayanen | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cha | Chamorro | Austronesian | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| chd | Highland Oaxaca Chontal | Tequistlatecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| chf | Tabasco 
Chontal | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| chk | Chuukese | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| chq | Quiotepec Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| chv | Chuvash | Turkic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| chz | Ozumacín Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cjk | Chokwe | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| cjo | Ashéninka Pajonal | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cjv | Chuave | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ckb | Central Kurdish | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 6 | -| cle | Lealao Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| clu | Caluyanun | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cme | Cerma | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cmn | Mandarin Chinese | Sino-Tibetan | 4 | 10 | 4 | 0 | 0 | 3 | 4 | 10 | 9 | 0 | 0 | 0 | 44 | -| cmo | Central Mnong | Austroasiatic | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| cni | Asháninka | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cnl | Lalana Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cnt | Tepetotutla Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| code | unknown | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 37 | 0 | 0 | 0 | 0 | 37 | -| cof | Colorado | Barbacoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| con | Cofán | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cop | Coptic | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cor | Cornish | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cot | Caquinte | Arawakan | 1 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cpa | Palantla Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cpb | Ucayali-Yurúa Ashéninka | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cpc | Ajyíninka Apurucayali | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cpu | Pichis Ashéninka | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cpy | South Ucayali Ashéninka | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| crh | Crimean Tatar | Turkic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| crn | El Nayar Cora | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| crx | Carrier | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| csb | Kashubian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cso | Sochiapam Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| csy | Siyin Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cta | Tataltepec Chatino | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cth | Thaiphum Chin | Bookkeeping | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ctp | Western Highland Chatino | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ctu | Chol | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cub | Cubeo | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cuc | Usila Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cui | Cuiba | Guahiboan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cuk | San Blas Kuna | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cut | Teutila Cuicatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cux | Tepeuxila Cuicatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cwe | Kwere | 
Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cya | Nopala Chatino | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| cym | Welsh | Indo-European | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| daa | Dangaléat | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dad | Marik | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dah | Gwahatike | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dan | Danish | Indo-European | 5 | 9 | 2 | 0 | 1 | 0 | 1 | 5 | 0 | 0 | 0 | 0 | 23 | -| ded | Dedua | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| deu | German | Indo-European | 6 | 14 | 7 | 0 | 1 | 7 | 2 | 18 | 4 | 0 | 0 | 0 | 59 | -| dgc | Casiguran Dumagat Agta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dgr | Dogrib | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dgz | Daga | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dhg | Dhangu-Djangu | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dif | Dieri | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dik | Southwestern Dinka | Nilotic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| div | Dhivehi | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dji | Djinang | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| djk | Eastern Maroon Creole | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| djr | Djambarrpuyngu | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dob | Dobu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| doi | Dogri (macrolanguage) | Unclassified | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| dop | Lukpa | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dov | Dombe | Atlantic-Congo 
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dsb | Lower Sorbian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dtp | Kadazan Dusun | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dwr | Dawro | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dww | Dawawa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dwy | Dhuwaya | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dyu | Dyula | Mande | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| dza | Tunzu | Atlantic-Congo | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| dzo | Dzongkha | Sino-Tibetan | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ebk | Eastern Bontok | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eko | Koti | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ell | Modern Greek (1453-) | Indo-European | 3 | 6 | 1 | 0 | 1 | 2 | 0 | 3 | 0 | 0 | 0 | 0 | 16 | -| emi | Mussau-Emira | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| emp | Northern Emberá | Chocoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eng | English | Indo-European | 17 | 160 | 18 | 3 | 1 | 13 | 8 | 108 | 13 | 2 | 1 | 0 | 344 | -| enq | Enga | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| epo | Esperanto | Artificial Language | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| eri | Ogea | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ese | Ese Ejja | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| esk | Northwest Alaska Inupiatun | Eskimo-Aleut | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| est | Estonian | Uralic | 2 | 2 | 1 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 8 | -| etr | Edolo | Bosavi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eus | Basque | Unclassified | 3 | 2 | 2 | 0 | 0 | 0 | 0 | 1 
| 0 | 0 | 0 | 0 | 8 | -| ewe | Ewe | Atlantic-Congo | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| faa | Fasu | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fai | Faiwol | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fao | Faroese | Indo-European | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | -| far | Fataleka | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fas | Persian | Indo-European | 1 | 28 | 5 | 0 | 0 | 8 | 2 | 40 | 3 | 0 | 0 | 3 | 90 | -| ffm | Maasina Fulfulde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fij | Fijian | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| fil | Filipino | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| fin | Finnish | Uralic | 3 | 5 | 1 | 0 | 1 | 1 | 2 | 5 | 1 | 0 | 0 | 0 | 19 | -| fon | Fon | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| for | Fore | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fra | French | Indo-European | 7 | 13 | 8 | 0 | 1 | 6 | 3 | 15 | 4 | 0 | 1 | 0 | 58 | -| fry | Western Frisian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fuc | Pulaar | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fue | Borgu Fulfulde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fuf | Pular | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fuh | Western Niger Fulfulde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fur | Friulian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| fuv | Nigerian Fulfulde | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | -| gah | Alekano | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gai | Borei | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| 
gam | Kandawo | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gaw | Nobonob | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gaz | West Central Oromo | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | -| gbm | Garhwali | Indo-European | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| gdn | Umanakaina | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gdr | Wipi | Eastern Trans-Fly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| geb | Kire | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gfk | Patpatar | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ghs | Guhu-Samane | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gla | Scottish Gaelic | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| gle | Irish | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| glg | Galician | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| glk | Gilaki | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| glv | Manx | Indo-European | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gmv | Gamo | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gng | Ngangam | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gnn | Gumatj | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gnw | Western Bolivian Guaraní | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gof | Gofa | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gom | Goan Konkani | Indo-European | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| grc | Ancient Greek (to 1453) | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| grn | Guarani | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | -| gsw | Swiss German | 
Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gub | Guajajára | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| guh | Guahibo | Guahiboan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gui | Eastern Bolivian Guaraní | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| guj | Gujarati | Indo-European | 6 | 6 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 18 | -| gul | Sea Island Creole English | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gum | Guambiano | Barbacoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gun | Mbyá Guaraní | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| guo | Guayabero | Guahiboan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gup | Gunwinggu | Gunwinyguan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gux | Gourmanchéma | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gvc | Guanano | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gvf | Golin | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gvn | Kuku-Yalanji | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gvs | Gumawana | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gwi | Gwichʼin | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gym | Ngäbere | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| gyr | Guarayu | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hat | Haitian | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 5 | -| hau | Hausa | Afro-Asiatic | 4 | 5 | 3 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 14 | -| haw | Hawaiian | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hbo | Ancient Hebrew | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hch | Huichol | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 1 | -| heb | Hebrew | Afro-Asiatic | 4 | 5 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 11 | -| heg | Helong | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hin | Hindi | Indo-European | 9 | 12 | 2 | 0 | 0 | 2 | 2 | 10 | 2 | 0 | 0 | 0 | 39 | -| hix | Hixkaryána | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hla | Halia | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hlt | Matu Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hmn | Hmong | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hmo | Hiri Motu | Pidgin | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hne | Chhattisgarhi | Indo-European | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| hns | Caribbean Hindustani | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hop | Hopi | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hot | Hote | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hrv | Croatian | Indo-European | 4 | 3 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 10 | -| hsb | Upper Sorbian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hto | Minica Huitoto | Huitotoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hub | Huambisa | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hui | Huli | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hun | Hungarian | Uralic | 5 | 3 | 1 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 12 | -| hus | Huastec | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| huu | Murui Huitoto | Huitotoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| huv | San Mateo Del Mar Huave | Huavean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hvn | Sabu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hye | Armenian | Indo-European | 3 | 3 | 1 | 0 | 0 | 1 | 0 
| 1 | 0 | 0 | 0 | 0 | 9 | -| ian | Iatmul | Ndu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ibo | Igbo | Atlantic-Congo | 3 | 5 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 12 | -| ido | Ido | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ign | Ignaciano | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ikk | Ika | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ikw | Ikwere | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ile | Interlingue | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ilo | Iloko | Austronesian | 2 | 1 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 6 | -| imo | Imbongu | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ina | Interlingua (International Auxiliary Language Association) | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| inb | Inga | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ind | Indonesian | Austronesian | 6 | 7 | 1 | 0 | 0 | 1 | 1 | 4 | 1 | 0 | 0 | 0 | 21 | -| ino | Inoke-Yate | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| iou | Tuma-Irumu | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ipi | Ipili | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| isl | Icelandic | Indo-European | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 9 | -| isn | Isanzu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ita | Italian | Indo-European | 5 | 9 | 1 | 0 | 1 | 2 | 1 | 5 | 3 | 0 | 0 | 0 | 27 | -| iws | Sepik Iwam | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ixl | Ixil | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jac | Popti' | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jae | Yabem | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 1 | -| jao | Yanyuwa | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jav | Javanese | Austronesian | 4 | 7 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 13 | -| jic | Tol | Jicaquean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jid | Bu (Kaduna State) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jiv | Shuar | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jni | Janji | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jpn | Japanese | Japonic | 5 | 8 | 3 | 0 | 0 | 2 | 3 | 13 | 2 | 0 | 0 | 0 | 36 | -| jvn | Caribbean Javanese | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kab | Kabyle | Afro-Asiatic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| kac | Kachin | Sino-Tibetan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | -| kam | Kamba (Kenya) | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kan | Kannada | Dravidian | 6 | 7 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 19 | -| kaq | Capanahua | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kas | Kashmiri | Indo-European | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| kat | Georgian | Kartvelian | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 10 | -| kaz | Kazakh | Turkic | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | -| kbc | Kadiwéu | Guaicuruan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kbh | Camsá | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kbm | Iwal | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kbp | Kabiyè | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kbq | Kamano | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kdc | Kutu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kde | Makonde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 
-| kdl | Tsikimba | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kea | Kabuverdianu | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | -| kek | Kekchí | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ken | Kenyang | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kew | West Kewa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kfg | Kudiya | Dravidian | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kfy | Kumaoni | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kgf | Kube | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kgk | Kaiwá | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kgp | Kaingang | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| khk | Halh Mongolian | Mongolic-Khitan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | -| khm | Khmer | Austroasiatic | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | -| khs | Kasua | Bosavi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| khz | Keapara | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kik | Kikuyu | Atlantic-Congo | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| kin | Kinyarwanda | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 8 | -| kir | Kirghiz | Turkic | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 7 | -| kiw | Northeast Kiwai | Kiwaian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kiz | Kisi | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kje | Kisar | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kjs | East Kewa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kkc | Odoodee | East Strickland | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kkl | Kosarek Yale | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| klt | Nukna | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| klv | Maskelynes | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmb | Kimbundu | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kmg | Kâte | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmh | Kalam | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmk | Limos Kalinga | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmo | Kwoma | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmr | Northern Kurdish | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| kms | Kamasau | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kmu | Kanite | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| knc | Central Kanuri | Saharan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kne | Kankanaey | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| knf | Mankanya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| knj | Western Kanjobal | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| knv | Tabo | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kon | Kongo | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kor | Korean | Koreanic | 4 | 8 | 1 | 0 | 1 | 3 | 1 | 9 | 3 | 0 | 0 | 0 | 30 | -| kos | Kosraean | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpf | Komba | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpg | Kapingamarangi | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpj | Karajá | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpr | Korafe-Yegha | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 1 | -| kpw | Kobon | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kpx | Mountain Koiali | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kqa | Mum | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kqc | Doromu-Koki | Manubaran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kqf | Kakabai | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kql | Kyenele | Yuat | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kqw | Kandas | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| krc | Karachay-Balkar | Turkic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ksd | Kuanua | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ksj | Uare | Kwalean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ksr | Borong | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ktm | Kurti | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kto | Kuot | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kud | 'Auhelawa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kue | Kuman (Papua New Guinea) | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kup | Kunimaipa | Kunimaipan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kur | Kurdish | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kvg | Kuni-Boazi | Anim | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kvn | Border Kuna | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kwd | Kwaio | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kwf | Kwara'ae | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kwi | Awa-Cuaiquer | Barbacoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kwj | Kwanga | Sepik | 1 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kyc | Kyaka | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kyf | Kouya | Kru | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kyg | Keyagana | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kyq | Kenga | Central Sudanic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kyz | Kayabí | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kze | Kosena | Bookkeeping | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| kzj | Coastal Kadazan | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lac | Lacandon | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lao | Lao | Tai-Kadai | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 5 | -| lat | Latin | Indo-European | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| lav | Latvian | Indo-European | 1 | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| lbb | Label | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lbk | Central Bontok | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lcm | Tungag | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| leu | Kara (Papua New Guinea) | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lex | Luang | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lfn | Lingua Franca Nova | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lgl | Wala | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lid | Nyindrou | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lif | Limbu | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lij | Ligurian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| lim | Limburgan | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| lin | Lingala | Atlantic-Congo | 2 
| 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | -| lit | Lithuanian | Indo-European | 4 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | -| llg | Lole | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| lmo | Lombard | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| ltg | Latgalian | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| ltz | Luxembourgish | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| lua | Luba-Lulua | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| lug | Ganda | Atlantic-Congo | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | -| luo | Luo (Kenya and Tanzania) | Nilotic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 5 | -| lus | Lushai | Sino-Tibetan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| lvs | Standard Latvian | Unclassified | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 5 | -| lww | Lewo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| maa | San Jerónimo Tecóatl Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mad | Madurese | Austronesian | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| mag | Magahi | Indo-European | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| mai | Maithili | Indo-European | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| maj | Jalapa De Díaz Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mak | Makasar | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| mal | Malayalam | Dravidian | 7 | 7 | 2 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 19 | -| mam | Mam | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| maq | Chiquihuitlán Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mar | Marathi | Indo-European | 7 | 6 | 2 | 0 | 0 | 1 | 0 | 2 | 2 | 0 | 0 | 0 | 20 | -| mau | Huautla Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 1 | -| mav | Sateré-Mawé | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| max | North Moluccan Malay | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| maz | Central Mazahua | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbb | Western Bukidnon Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbc | Macushi | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbh | Mangseng | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbj | Nadëb | Naduhup | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbl | Maxakalí | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbs | Sarangani Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mbt | Matigsalug Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mca | Maca | Mataguayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcb | Machiguenga | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcd | Sharanahua | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcf | Matsés | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mco | Coatlán Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcp | Makaa | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcq | Ese | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mcr | Menya | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mdy | Male (Ethiopia) | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| med | Melpa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mee | Mengen | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mek | Mekeo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| meq | Merey | Afro-Asiatic | 1 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| met | Mato | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| meu | Motu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mey | Hassaniyya | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mgc | Morokodo | Central Sudanic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mgh | Makhuwa-Meetto | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mgw | Matumbi | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mhl | Mauwake | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mhr | Eastern Mari | Uralic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mib | Atatláhuca Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mic | Mi'kmaq | Algic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mie | Ocotepec Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mig | San Miguel El Grande Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mih | Chayuco Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mil | Peñoles Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| min | Minangkabau | Austronesian | 3 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | -| mio | Pinotepa Nacional Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mir | Isthmus Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mit | Southern Puebla Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| miz | Coatzospan Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mjc | San Juan Colorado Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mkd | Macedonian | Indo-European | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 7 | -| mkj | Mokilese 
| Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mkl | Mokole | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mkn | Kupang Malay | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mks | Silacayoapan Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mle | Manambu | Ndu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mlg | Malagasy | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mlh | Mape | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mlp | Bargam | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mlt | Maltese | Afro-Asiatic | 2 | 2 | 2 | 0 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 9 | -| mmo | Mangga Buang | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mmx | Madak | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mna | Mbula | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mni | Manipuri | Sino-Tibetan | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| mon | Mongolian | Unclassified | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| mop | Mopán Maya | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mos | Mossi | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| mox | Molima | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mph | Maung | Iwaidjan Proper | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mpj | Martu Wangka | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mpm | Yosondúa Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mpp | Migabac | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mps | Dadibi | Teberan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mpt | Mian | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 1 | -| mpx | Misima-Panaeati | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mqb | Mbuko | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mqj | Mamasa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mri | Maori | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 5 | -| msa | Malay (macrolanguage) | Unclassified | 1 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| msb | Masbatenyo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| msc | Sankaran Maninka | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| msk | Mansaka | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| msm | Agusan Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| msy | Aruamu | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mti | Maiwa (Papua New Guinea) | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mto | Totontepec Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mui | Musi | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| mup | Malvi | Indo-European | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| mux | Bo-Ung | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| muy | Muyang | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mva | Manam | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mvn | Minaveha | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mwc | Are | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mwe | Mwera (Chimwera) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mwf | Murrinh-Patha | Southern Daly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mwp | Kala Lagaw Ya | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mwr | Marwari | 
Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mxb | Tezoatlán Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mxp | Tlahuitoltepec Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mxq | Juquila Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mxt | Jamiltepec Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mya | Burmese | Sino-Tibetan | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 9 | -| myk | Mamara Senoufo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| myu | Mundurukú | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| myw | Muyuw | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| myy | Macuna | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| mzz | Maiadomu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nab | Southern Nambikuára | Nambiquaran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| naf | Nabak | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nak | Nakanai | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nas | Naasioi | South Bougainville | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nbl | South Ndebele | Unclassified | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nbq | Nggem | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nca | Iyo | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nch | Central Huasteca Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ncj | Northern Puebla Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ncl | Michoacán Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ncu | Chumburung | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 
-| nde | North Ndebele | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ndg | Ndengereko | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ndj | Ndamba | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nds | Low German | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nep | Nepali (macrolanguage) | Unclassified | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| nfa | Dhao | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ngp | Ngulu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ngu | Guerrero Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhe | Eastern Huasteca Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhg | Tetelcingo Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhi | Zacatlán-Ahuacatlán-Tepetzintla Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nho | Takuu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhr | Naro | Khoe-Kwadi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhu | Noone | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhw | Western Huasteca Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nhy | Northern Oaxaca Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nif | Nek | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nii | Nii | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nij | Ngaju | Austronesian | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| nin | Ninzo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nko | Nkonya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nld | Dutch | Indo-European | 6 | 6 | 1 | 0 | 1 | 1 | 1 
| 2 | 2 | 0 | 0 | 0 | 20 | -| nlg | Gela | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nna | Nyangumarta | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nno | Norwegian Nynorsk | Unclassified | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | -| nnq | Ngindo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| noa | Woun Meu | Chocoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nob | Norwegian Bokmål | Unclassified | 4 | 7 | 5 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 19 | -| noe | Nimadi | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nop | Numanggang | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nor | Norwegian | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 3 | -| not | Nomatsiguenga | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nou | Ewage-Notu | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nov | Novial | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| npi | Nepali (individual language) | Indo-European | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | -| npl | Southeastern Puebla Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nqo | N'Ko | Artificial Language | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| nsn | Nehan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nso | Pedi | Atlantic-Congo | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 6 | -| nss | Nali | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ntj | Ngaanyatjarra | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ntp | Northern Tepehuan | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ntu | Natügu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nus | Nuer | Nilotic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 3 | -| nuy | Nunggubuyu | Gunwinyguan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nvm | Namiae | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nwi | Southwest Tanna | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nya | Nyanja | Atlantic-Congo | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 6 | -| nys | Nyungar | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| nyu | Nyungwe | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| obo | Obo Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| oci | Occitan (post 1500) | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| okv | Orokaiva | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| omw | South Tairora | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ong | Olo | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ons | Ono | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ood | Tohono O'odham | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| opm | Oksapmin | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ori | Oriya (macrolanguage) | Unclassified | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| orm | Oromo | Unclassified | 1 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| orv | Old Russian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ory | Odia | Indo-European | 5 | 4 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 15 | -| ote | Mezquital Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| otm | Eastern Highland Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| otn | Tenango Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| otq | Querétaro Otomi | Otomanguean | 1 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ots | Estado de México Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pab | Parecís | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pad | Paumarí | Arawan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pag | Pangasinan | Austronesian | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| pah | Tenharim | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pam | Pampanga | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pan | Panjabi | Indo-European | 6 | 6 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 0 | 18 | -| pao | Northern Paiute | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pap | Papiamento | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| pbt | Southern Pashto | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | -| pcm | Nigerian Pidgin | Indo-European | 1 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| pes | Iranian Persian | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 6 | -| pib | Yine | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pio | Piapoco | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pir | Piratapuyo | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| piu | Pintupi-Luritja | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pjt | Pitjantjatjara | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pls | San Marcos Tlacoyalco Popoloca | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| plt | Plateau Malagasy | Austronesian | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | -| plu | Palikúr | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pma | Paama | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pms | Piemontese | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 1 | -| poe | San Juan Atzingo Popoloca | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| poh | Poqomchi' | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| poi | Highland Popoluca | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pol | Polish | Indo-European | 4 | 11 | 4 | 0 | 1 | 4 | 0 | 18 | 4 | 0 | 0 | 0 | 46 | -| pon | Pohnpeian | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| por | Portuguese | Indo-European | 4 | 9 | 1 | 0 | 2 | 3 | 1 | 5 | 3 | 0 | 0 | 0 | 28 | -| poy | Pogolo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ppo | Folopa | Teberan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| prf | Paranan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pri | Paicî | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| prs | Dari | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ptp | Patep | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ptu | Bambam | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| pus | Pushto | Unclassified | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| pwg | Gapapaiwa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qub | Huallaga Huánuco Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| quc | K'iche' | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| quf | Lambayeque Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| quh | South Bolivian Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qul | North Bolivian Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qup | Southern Pastaza Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| quy | Ayacucho Quechua | Quechuan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| qvc | 
Cajamarca Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qve | Eastern Apurímac Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvh | Huamalíes-Dos de Mayo Huánuco Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvm | Margos-Yarowilca-Lauricocha Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvn | North Junín Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvs | San Martín Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvw | Huaylla Wanca Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qvz | Northern Pastaza Quichua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qwh | Huaylas Ancash Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qxh | Panao Huánuco Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qxn | Northern Conchucos Ancash Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| qxo | Southern Conchucos Ancash Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rai | Ramoaaina | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| raj | Rajasthani | Unclassified | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| reg | Kara (Tanzania) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rej | Rejang | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| rgu | Ringgou | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rkb | Rikbaktsa | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rmc | Carpathian Romani | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rmy | Vlax Romani | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rom | Romany | Unclassified | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
2 | -| ron | Romanian | Indo-European | 5 | 6 | 1 | 0 | 1 | 0 | 1 | 3 | 1 | 0 | 0 | 0 | 18 | -| roo | Rotokas | North Bougainville | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rop | Kriol | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| row | Dela-Oenale | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rro | Waima | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ruf | Luguru | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| rug | Roviana | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| run | Rundi | Atlantic-Congo | 1 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| rus | Russian | Indo-European | 5 | 13 | 6 | 0 | 2 | 4 | 2 | 16 | 4 | 0 | 0 | 0 | 52 | -| rwo | Rawa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sab | Buglere | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sag | Sango | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| sah | Yakut | Turkic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| san | Sanskrit | Indo-European | 5 | 3 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | -| sat | Santali | Austroasiatic | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| sbe | Saliba | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sbk | Safwa | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sbs | Subiya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| scn | Sicilian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| sco | Scots | Indo-European | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| seh | Sena | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sey | Secoya | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sgb | Mag-antsi Ayta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 
-| sgz | Sursurunga | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| shi | Tachelhit | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| shj | Shatt | Dajuic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| shn | Shan | Tai-Kadai | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | -| shp | Shipibo-Conibo | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sim | Mende (Papua New Guinea) | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sin | Sinhala | Indo-European | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 7 | -| sja | Epena | Chocoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| slk | Slovak | Indo-European | 3 | 4 | 1 | 0 | 1 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 12 | -| sll | Salt-Yui | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| slv | Slovenian | Indo-European | 3 | 4 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 10 | -| smk | Bolinao | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| smo | Samoan | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| sna | Shona | Atlantic-Congo | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | -| snc | Sinaugoro | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| snd | Sindhi | Indo-European | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | -| snn | Siona | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| snp | Siane | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| snx | Sam | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sny | Saniyo-Hiyewe | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| som | Somali | Afro-Asiatic | 3 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 9 | -| soq | Kanasi | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sot | Southern Sotho | Atlantic-Congo | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 
| 0 | 5 | -| soy | Miyobe | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| spa | Spanish | Indo-European | 4 | 13 | 4 | 0 | 1 | 3 | 2 | 13 | 4 | 0 | 0 | 0 | 44 | -| spl | Selepet | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| spm | Akukem | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| spp | Supyire Senoufo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sps | Saposa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| spy | Sabaot | Nilotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sqi | Albanian | Unclassified | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| srd | Sardinian | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| sri | Siriano | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| srm | Saramaccan | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| srn | Sranan Tongo | Indo-European | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| srp | Serbian | Indo-European | 4 | 1 | 1 | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 9 | -| srq | Sirionó | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ssd | Siroi | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ssg | Seimat | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ssw | Swati | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 7 | -| ssx | Samberigi | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| stp | Southeastern Tepehuan | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sua | Sulka | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sue | Suena | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sun | Sundanese | Austronesian | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 9 | -| sus | Susu | Mande | 1 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| suz | Sunwar | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| svk | Slovakian Sign Language | Sign Language | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| swa | Swahili (macrolanguage) | Atlantic-Congo | 1 | 7 | 2 | 0 | 0 | 1 | 1 | 3 | 0 | 0 | 0 | 0 | 15 | -| swe | Swedish | Indo-European | 4 | 8 | 3 | 0 | 1 | 1 | 1 | 4 | 0 | 0 | 0 | 0 | 22 | -| swg | Swabian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| swh | Swahili (individual language) | Atlantic-Congo | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 6 | -| swp | Suau | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| sxb | Suba | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| szl | Silesian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| tac | Lowland Tarahumara | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tah | Tahitian | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| taj | Eastern Tamang | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tam | Tamil | Dravidian | 7 | 7 | 2 | 0 | 0 | 1 | 0 | 3 | 1 | 0 | 0 | 0 | 21 | -| taq | Tamasheq | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| tat | Tatar | Turkic | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| tav | Tatuyo | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| taw | Tai | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tbc | Takia | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tbf | Mandara | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tbg | North Tairora | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tbo | Tawala | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tbz | Ditammari | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 1 | -| tca | Ticuna | Ticuna-Yuri | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tcs | Torres Strait Creole | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tcz | Thado Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tdt | Tetun Dili | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tee | Huehuetla Tepehua | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tel | Telugu | Dravidian | 7 | 7 | 2 | 0 | 0 | 0 | 1 | 5 | 2 | 0 | 0 | 0 | 24 | -| ter | Tereno | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tet | Tetum | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tew | Tewa (USA) | Kiowa-Tanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tfr | Teribe | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tgk | Tajik | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 6 | -| tgl | Tagalog | Austronesian | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | -| tgo | Sudest | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tgp | Tangoa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tha | Thai | Tai-Kadai | 4 | 8 | 1 | 0 | 0 | 1 | 1 | 6 | 0 | 0 | 0 | 0 | 21 | -| tif | Tifal | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tim | Timbe | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tir | Tigrinya | Afro-Asiatic | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | -| tiw | Tiwi | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tiy | Tiruray | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tke | Takwane | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tku | Upper Necaxa Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tlf | Telefol | Nuclear Trans New Guinea | 1 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tmd | Haruai | Piawi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tna | Tacana | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tnc | Tanimuca-Retuarã | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tnk | Kwamera | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tnn | North Tanna | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tnp | Whitesands | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| toc | Coyutla Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tod | Toma | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tof | Gizrra | Eastern Trans-Fly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| toj | Tojolabal | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ton | Tonga (Tonga Islands) | Austronesian | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| too | Xicotepec De Juárez Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| top | Papantla Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tos | Highland Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tpa | Taupota | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tpi | Tok Pisin | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| tpt | Tlachichilco Tepehua | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tpz | Tinputz | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| trc | Copala Triqui | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tsn | Tswana | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 7 | -| tso | Tsonga | Atlantic-Congo | 1 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 7 | -| tsw | Tsishingini | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 1 | -| ttc | Tektiteko | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tte | Bwanabwana | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tuc | Mutu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tue | Tuyuca | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tuf | Central Tunebo | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tuk | Turkmen | Turkic | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | -| tum | Tumbuka | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| tuo | Tucano | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tur | Turkish | Turkic | 4 | 7 | 1 | 0 | 0 | 3 | 0 | 3 | 2 | 0 | 0 | 0 | 20 | -| tvk | Southeast Ambrym | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| twi | Twi | Unclassified | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| txq | Tii | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| txu | Kayapó | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tyv | Tuvinian | Turkic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tzj | Tz'utujil | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tzl | Talossan | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tzm | Central Atlas Tamazight | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| tzo | Tzotzil | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ubr | Ubir | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ubu | Umbu-Ungu | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| udu | Uduk | Koman | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| uig | Uighur | Turkic | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | -| ukr | Ukrainian | Indo-European | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 8 | -| uli | Ulithian | 
Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ulk | Meriam Mir | Eastern Trans-Fly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| umb | Umbundu | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| upv | Uripiv-Wala-Rano-Atchin | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ura | Urarina | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| urb | Urubú-Kaapor | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| urd | Urdu | Indo-European | 7 | 8 | 2 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 19 | -| uri | Urim | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| urt | Urat | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| urw | Sop | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| usa | Usarufa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| usp | Uspanteco | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| uvh | Uri | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| uvl | Lote | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| uzb | Uzbek | Unclassified | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| uzn | Northern Uzbek | Turkic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 4 | -| vec | Venetian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| ven | Venda | Atlantic-Congo | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | -| vid | Vidunda | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| vie | Vietnamese | Austroasiatic | 5 | 6 | 1 | 0 | 0 | 1 | 0 | 5 | 0 | 0 | 0 | 0 | 18 | -| viv | Iduna | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| vmy | Ayautla Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| waj | Waffa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wal | Wolaytta | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wap | Wapishana | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| war | Waray (Philippines) | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 5 | -| wat | Kaninuwa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wbi | Vwanji | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wbp | Warlpiri | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wed | Wedau | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wer | Weri | Kunimaipan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wim | Wik-Mungkan | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wiu | Wiru | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wiv | Vitu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wln | Walloon | Indo-European | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wmt | Walmajarri | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wmw | Mwani | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wnc | Wantoat | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wnu | Usan | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wol | Wolof | Atlantic-Congo | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 6 | -| wos | Hanga Hundi | Ndu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wrk | Garrwa | Garrwan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wro | Worrorra | Worrorran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wrs | Waris | Border | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wsk | Waskia | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wuu | Wu Chinese | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| wuv | Wuvulu-Aua | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xav | Xavánte | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xbi | Kombio | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xed | Hdi | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xho | Xhosa | Atlantic-Congo | 3 | 3 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 10 | -| xla | Kamula | Kamula-Elevala | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xnn | Northern Kankanay | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xon | Konkomba | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xsi | Sio | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xtd | Diuxi-Tilantongo Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| xtm | Magdalena Peñasco Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yaa | Yaminahua | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yad | Yagua | Peba-Yagua | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yal | Yalunka | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yap | Yapese | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yaq | Yaqui | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yby | Yaweyuha | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ycn | Yucuna | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ydd | Eastern Yiddish | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| yid | Yiddish | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yka | Yakan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yle | Yele | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yml | Iamalele | 
Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yon | Yongkom | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yor | Yoruba | Atlantic-Congo | 4 | 5 | 3 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 16 | -| yrb | Yareba | Yareban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yre | Yaouré | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yss | Yessan-Mayo | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yue | Yue Chinese | Sino-Tibetan | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| yuj | Karkar-Yuri | Pauwasi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yut | Yopno | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yuw | Yau (Morobe Province) | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yva | Yawa | Yawa-Saweru | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zaa | Sierra de Juárez Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zab | Western Tlacolula Valley Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zac | Ocotlán Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zad | Cajonos Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zai | Isthmus Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zaj | Zaramo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zam | Miahuatlán Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zao | Ozolotepec Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zap | Zapotec | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zar | Rincón Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zas | Santo Domingo Albarradas Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 1 | -| zat | Tabaa Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zav | Yatzachi Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zaw | Mitla Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zca | Coatecas Altas Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zga | Kinga | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zho | Chinese | Unclassified | 2 | 2 | 1 | 0 | 0 | 2 | 1 | 13 | 0 | 0 | 0 | 0 | 21 | -| zia | Zia | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ziw | Zigula | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zlm | Malay (individual language) | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zos | Francisco León Zoque | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpc | Choapan Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpl | Lachixío Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpm | Mixtepec Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpo | Amatlán Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpq | Zoogocho Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpu | Yalálag Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpv | Chichicapan Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zpz | Texmelucan Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zsm | Standard Malay | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 5 | -| zsr | Southern Rincon Zapotec | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ztq | Quioquitani-Quierí Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 1 | -| zty | Yatee Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zul | Zulu | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 7 | -| zyp | Zyphe Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| Total | None | None | None | 1395 | 836 | 311 | 3 | 28 | 91 | 51 | 507 | 88 | 2 | 2 | 3 | +| ISO Code | Language | Family | BitextMining | Classification | Clustering | InstructionRetrieval | MultilabelClassification | PairClassification | Reranking | Retrieval | STS | Speed | Summarization | Sum | +|---|------|------|------|------|------|------|------|------|------|------|------|---| +| aai | Arifama-Miniafia | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aak | Ankave | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aau | Abau | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aaz | Amarasi | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| abs | Ambonese Malay | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| abt | Ambulas | Ndu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| abx | Inabaknon | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aby | Aneme Wake | Yareban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ace | Achinese | Austronesian | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| acf | Saint Lucian Creole French | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| acm | Mesopotamian Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | +| acq | Ta'izzi-Adeni Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| acr | Achi | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| acu | Achuar-Shiwiar | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| adz | Adzera | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aeb | Tunisian Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 3 | +| aer | Eastern Arrernte | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aey | Amele | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| afr | Afrikaans | Indo-European | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 10 | +| agd | Agarabi | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agg | Angor | Senagi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agm | Angaataha | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agn | Agutaynen | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agr | Aguaruna | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agt | Central Cagayan Agta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| agu | Aguacateco | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aia | Arosi | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aii | Assyrian Neo-Aramaic | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ajp | South Levantine Arabic | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| aka | Akan | Atlantic-Congo | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ake | Akawaio | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| alp | Alune | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| alq | Algonquin | Algic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| als | Tosk Albanian | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | +| aly | Alyawarr | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ame | Yanesha' | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amf | Hamer-Banna | South Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amh | Amharic | Afro-Asiatic | 3 | 6 | 3 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 14 | +| amk | Ambai | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amm | Ama (Papua New Guinea) | Left May | 1 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amn | Amanab | Border | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amo | Amo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amp | Alamblak | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amr | Amarakaeri | Harakmbut | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amu | Guerrero Amuzgo | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| amx | Anmatyerre | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ang | Old English (ca. 450-1100) | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| anh | Nend | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| anp | Angika | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| anv | Denya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aoi | Anindilyakwa | Gunwinyguan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aoj | Mufian | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aom | Ömie | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aon | Bumbita Arapesh | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apb | Sa'a | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apc | Levantine Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | +| ape | Bukiyip | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apn | Apinayé | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apr | Arop-Lokep | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apu | Apurinã | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apw | Western Apache | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| apz | Safeyoka | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ara | Arabic | Unclassified | 2 | 12 | 0 | 0 | 0 | 2 | 2 | 9 | 2 | 0 | 0 | 29 | +| arb 
| Standard Arabic | Afro-Asiatic | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 8 | +| are | Western Arrarnta | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| arl | Arabela | Zaparoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| arn | Mapudungun | Araucanian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| arp | Arapaho | Algic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| arq | Algerian Arabic | Afro-Asiatic | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 4 | +| ars | Najdi Arabic | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | +| ary | Moroccan Arabic | Afro-Asiatic | 1 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 7 | +| arz | Egyptian Arabic | Afro-Asiatic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | +| asm | Assamese | Indo-European | 5 | 3 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 14 | +| aso | Dano | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ast | Asturian | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ata | Pele-Ata | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| atb | Zaiwa | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| atd | Ata Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| atg | Ivbie North-Okpela-Arhe | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| att | Pamplona Atta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| auc | Waorani | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| aui | Anuki | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| auy | Awiyaana | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| avt | Au | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| awa | Awadhi | Indo-European | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| awb | Awa (Papua New Guinea) | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| 
awk | Awabakal | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| awx | Awara | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ayr | Central Aymara | Aymaran | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| azb | South Azerbaijani | Turkic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| aze | Azerbaijani | Unclassified | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| azg | San Pedro Amuzgos Amuzgo | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| azj | North Azerbaijani | Turkic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | +| azz | Highland Puebla Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bak | Bashkir | Turkic | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| bam | Bambara | Mande | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | +| ban | Balinese | Austronesian | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| bao | Waimaha | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bba | Baatonum | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bbb | Barai | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bbc | Batak Toba | Austronesian | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| bbr | Girawa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bch | Bariai | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bco | Kaluli | Bosavi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bdd | Bunama | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bea | Beaver | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bef | Benabena | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bel | Belarusian | Indo-European | 4 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| bem | Bemba (Zambia) | Atlantic-Congo | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ben | Bengali | Indo-European | 7 | 
9 | 2 | 0 | 0 | 1 | 2 | 6 | 1 | 0 | 0 | 28 | +| beo | Beami | Bosavi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ber | Berber (Other) | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| beu | Blagar | Timor-Alor-Pantar | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bew | Betawi | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| bgc | Haryanvi | Indo-European | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| bgs | Tagabawa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bgt | Bughotu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bhb | Bhili | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bhd | Bhadrawahi | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bhg | Binandere | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bhl | Bimin | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bho | Bhojpuri | Indo-European | 2 | 2 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 6 | +| bhp | Bima | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| big | Biangai | Kunimaipan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjj | Kanauji | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjk | Barok | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjn | Banjar | Austronesian | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| bjp | Fanamaket | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjr | Binumarien | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjv | Bedjond | Central Sudanic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bjz | Baruga | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bkd | Binukid | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bki | Baki | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bkq | 
Bakairí | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bkx | Baikeno | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| blw | Balangao | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| blz | Balantak | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bmh | Kein | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bmk | Ghayavi | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bmr | Muinane | Boran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bmu | Somba-Siawari | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bnp | Bola | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bns | Bundeli | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| boa | Bora | Boran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bod | Tibetan | Sino-Tibetan | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | +| boj | Anjam | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bon | Bine | Eastern Trans-Fly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bos | Bosnian | Indo-European | 3 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| box | Buamu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| boy | Bodo (Central African Republic) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bpr | Koronadal Blaan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bps | Sarangani Blaan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bqc | Boko (Benin) | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bqp | Busa | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bra | Braj | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bre | Breton | Indo-European | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| brx | Bodo (India) | Sino-Tibetan | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 5 | +| bsj | Bangwinji | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bsn | Barasana-Eduria | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bsp | Baga Sitemu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bss | Akoose | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bug | Buginese | Austronesian | 2 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| buk | Bugawac | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bul | Bulgarian | Indo-European | 3 | 4 | 1 | 0 | 1 | 1 | 1 | 2 | 0 | 0 | 0 | 13 | +| bus | Bokobaru | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bvd | Baeggu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bvr | Burarra | Maningrida | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bxh | Buhutu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| byr | Baruya | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| byx | Qaqet | Baining | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bzd | Bribri | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bzh | Mapos Buang | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| bzj | Belize Kriol English | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| caa | Chortí | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cab | Garifuna | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cac | Chuj | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| caf | Southern Carrier | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cak | Kaqchikel | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cao | Chácobo | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cap | Chipaya | Uru-Chipaya | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| car | Galibi Carib | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cat | Catalan | 
Indo-European | 3 | 2 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | +| cav | Cavineña | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cax | Chiquitano | Chiquitano | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbc | Carapana | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbi | Chachi | Barbacoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbk | Chavacano | Indo-European | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| cbr | Cashibo-Cacataibo | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbs | Cashinahua | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbt | Chayahuita | Cahuapanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbu | Candoshi-Shapra | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cbv | Cacua | Kakua-Nukak | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cco | Comaltepec Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ceb | Cebuano | Austronesian | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | +| cek | Eastern Khumi Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ces | Czech | Indo-European | 4 | 5 | 2 | 0 | 1 | 2 | 1 | 2 | 0 | 0 | 0 | 17 | +| cgc | Kagayanen | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cha | Chamorro | Austronesian | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| chd | Highland Oaxaca Chontal | Tequistlatecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| chf | Tabasco Chontal | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| chk | Chuukese | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| chq | Quiotepec Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| chv | Chuvash | Turkic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| chz | Ozumacín Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cjk | Chokwe | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| cjo | Ashéninka Pajonal | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cjv | Chuave | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ckb | Central Kurdish | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | +| cle | Lealao Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| clu | Caluyanun | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cme | Cerma | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cmn | Mandarin Chinese | Sino-Tibetan | 4 | 10 | 4 | 0 | 0 | 3 | 4 | 10 | 9 | 0 | 0 | 44 | +| cmo | Central Mnong | Austroasiatic | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| cni | Asháninka | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cnl | Lalana Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cnt | Tepetotutla Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| code | unknown | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 37 | 0 | 0 | 0 | 37 | +| cof | Colorado | Barbacoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| con | Cofán | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cop | Coptic | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cor | Cornish | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cot | Caquinte | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cpa | Palantla Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cpb | Ucayali-Yurúa Ashéninka | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cpc | Ajyíninka Apurucayali | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cpu | Pichis Ashéninka | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cpy | South Ucayali Ashéninka | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| crh | Crimean Tatar | Turkic | 1 | 1 | 1 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| crn | El Nayar Cora | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| crx | Carrier | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| csb | Kashubian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cso | Sochiapam Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| csy | Siyin Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cta | Tataltepec Chatino | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cth | Thaiphum Chin | Bookkeeping | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ctp | Western Highland Chatino | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ctu | Chol | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cub | Cubeo | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cuc | Usila Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cui | Cuiba | Guahiboan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cuk | San Blas Kuna | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cut | Teutila Cuicatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cux | Tepeuxila Cuicatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cwe | Kwere | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cya | Nopala Chatino | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| cym | Welsh | Indo-European | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| daa | Dangaléat | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dad | Marik | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dah | Gwahatike | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dan | Danish | Indo-European | 5 | 9 | 2 | 0 | 1 | 0 | 1 | 5 | 0 | 0 | 0 | 23 | +| ded | Dedua | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 1 | +| deu | German | Indo-European | 6 | 14 | 7 | 0 | 1 | 7 | 2 | 18 | 4 | 0 | 0 | 59 | +| dgc | Casiguran Dumagat Agta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dgr | Dogrib | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dgz | Daga | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dhg | Dhangu-Djangu | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dif | Dieri | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dik | Southwestern Dinka | Nilotic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| div | Dhivehi | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dji | Djinang | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| djk | Eastern Maroon Creole | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| djr | Djambarrpuyngu | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dob | Dobu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| doi | Dogri (macrolanguage) | Unclassified | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| dop | Lukpa | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dov | Dombe | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dsb | Lower Sorbian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dtp | Kadazan Dusun | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dwr | Dawro | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dww | Dawawa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dwy | Dhuwaya | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dyu | Dyula | Mande | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| dza | Tunzu | Atlantic-Congo | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| dzo | Dzongkha | Sino-Tibetan | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ebk | Eastern Bontok | 
Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| eko | Koti | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ell | Modern Greek (1453-) | Indo-European | 3 | 6 | 1 | 0 | 1 | 2 | 0 | 3 | 0 | 0 | 0 | 16 | +| emi | Mussau-Emira | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| emp | Northern Emberá | Chocoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| eng | English | Indo-European | 17 | 160 | 18 | 3 | 1 | 13 | 8 | 108 | 13 | 2 | 1 | 344 | +| enq | Enga | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| epo | Esperanto | Artificial Language | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| eri | Ogea | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ese | Ese Ejja | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| esk | Northwest Alaska Inupiatun | Eskimo-Aleut | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| est | Estonian | Uralic | 2 | 2 | 1 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 8 | +| etr | Edolo | Bosavi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| eus | Basque | Unclassified | 3 | 2 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | +| ewe | Ewe | Atlantic-Congo | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| faa | Fasu | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fai | Faiwol | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fao | Faroese | Indo-European | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 7 | +| far | Fataleka | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fas | Persian | Indo-European | 4 | 28 | 5 | 0 | 0 | 8 | 2 | 40 | 3 | 0 | 0 | 90 | +| ffm | Maasina Fulfulde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fij | Fijian | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| fil | Filipino | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| fin | Finnish | Uralic | 3 | 5 | 1 | 0 | 
1 | 1 | 2 | 5 | 1 | 0 | 0 | 19 | +| fon | Fon | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| for | Fore | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fra | French | Indo-European | 7 | 13 | 8 | 0 | 1 | 6 | 3 | 15 | 4 | 0 | 1 | 58 | +| fry | Western Frisian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fuc | Pulaar | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fue | Borgu Fulfulde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fuf | Pular | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fuh | Western Niger Fulfulde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| fur | Friulian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| fuv | Nigerian Fulfulde | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | +| gah | Alekano | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gai | Borei | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gam | Kandawo | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gaw | Nobonob | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gaz | West Central Oromo | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | +| gbm | Garhwali | Indo-European | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| gdn | Umanakaina | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gdr | Wipi | Eastern Trans-Fly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| geb | Kire | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gfk | Patpatar | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ghs | Guhu-Samane | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gla | Scottish Gaelic | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| gle | Irish | Indo-European | 3 | 1 | 1 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| glg | Galician | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| glk | Gilaki | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| glv | Manx | Indo-European | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gmv | Gamo | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gng | Ngangam | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gnn | Gumatj | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gnw | Western Bolivian Guaraní | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gof | Gofa | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gom | Goan Konkani | Indo-European | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| grc | Ancient Greek (to 1453) | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| grn | Guarani | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | +| gsw | Swiss German | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gub | Guajajára | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| guh | Guahibo | Guahiboan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gui | Eastern Bolivian Guaraní | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| guj | Gujarati | Indo-European | 6 | 6 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 18 | +| gul | Sea Island Creole English | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gum | Guambiano | Barbacoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gun | Mbyá Guaraní | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| guo | Guayabero | Guahiboan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gup | Gunwinggu | Gunwinyguan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gux | Gourmanchéma | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gvc | Guanano | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gvf | Golin 
| Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gvn | Kuku-Yalanji | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gvs | Gumawana | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gwi | Gwichʼin | Athabaskan-Eyak-Tlingit | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gym | Ngäbere | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| gyr | Guarayu | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hat | Haitian | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | +| hau | Hausa | Afro-Asiatic | 4 | 5 | 3 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 14 | +| haw | Hawaiian | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hbo | Ancient Hebrew | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hch | Huichol | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| heb | Hebrew | Afro-Asiatic | 4 | 5 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 11 | +| heg | Helong | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hin | Hindi | Indo-European | 9 | 12 | 2 | 0 | 0 | 2 | 2 | 10 | 2 | 0 | 0 | 39 | +| hix | Hixkaryána | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hla | Halia | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hlt | Matu Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hmn | Hmong | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hmo | Hiri Motu | Pidgin | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hne | Chhattisgarhi | Indo-European | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| hns | Caribbean Hindustani | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hop | Hopi | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hot | Hote | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hrv | Croatian | Indo-European | 4 | 3 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 10 | +| hsb | 
Upper Sorbian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hto | Minica Huitoto | Huitotoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hub | Huambisa | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hui | Huli | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hun | Hungarian | Uralic | 5 | 3 | 1 | 0 | 1 | 0 | 0 | 2 | 0 | 0 | 0 | 12 | +| hus | Huastec | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| huu | Murui Huitoto | Huitotoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| huv | San Mateo Del Mar Huave | Huavean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hvn | Sabu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| hye | Armenian | Indo-European | 3 | 3 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 9 | +| ian | Iatmul | Ndu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ibo | Igbo | Atlantic-Congo | 3 | 5 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 12 | +| ido | Ido | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ign | Ignaciano | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ikk | Ika | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ikw | Ikwere | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ile | Interlingue | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ilo | Iloko | Austronesian | 2 | 1 | 2 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | +| imo | Imbongu | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ina | Interlingua (International Auxiliary Language Association) | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| inb | Inga | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ind | Indonesian | Austronesian | 6 | 7 | 1 | 0 | 0 | 1 | 1 | 4 | 1 | 0 | 0 | 21 | +| ino | Inoke-Yate | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| iou | Tuma-Irumu | 
Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ipi | Ipili | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| isl | Icelandic | Indo-European | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 9 | +| isn | Isanzu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ita | Italian | Indo-European | 5 | 9 | 1 | 0 | 1 | 2 | 1 | 5 | 3 | 0 | 0 | 27 | +| iws | Sepik Iwam | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ixl | Ixil | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jac | Popti' | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jae | Yabem | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jao | Yanyuwa | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jav | Javanese | Austronesian | 4 | 7 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 13 | +| jic | Tol | Jicaquean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jid | Bu (Kaduna State) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jiv | Shuar | Chicham | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jni | Janji | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| jpn | Japanese | Japonic | 5 | 8 | 3 | 0 | 0 | 2 | 3 | 13 | 2 | 0 | 0 | 36 | +| jvn | Caribbean Javanese | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kab | Kabyle | Afro-Asiatic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| kac | Kachin | Sino-Tibetan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | +| kam | Kamba (Kenya) | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kan | Kannada | Dravidian | 6 | 7 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 19 | +| kaq | Capanahua | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kas | Kashmiri | Indo-European | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| kat | Georgian | Kartvelian | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 10 | +| kaz | Kazakh | Turkic | 3 
| 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | +| kbc | Kadiwéu | Guaicuruan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kbh | Camsá | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kbm | Iwal | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kbp | Kabiyè | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kbq | Kamano | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kdc | Kutu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kde | Makonde | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kdl | Tsikimba | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kea | Kabuverdianu | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | +| kek | Kekchí | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ken | Kenyang | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kew | West Kewa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kfg | Kudiya | Dravidian | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kfy | Kumaoni | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kgf | Kube | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kgk | Kaiwá | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kgp | Kaingang | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| khk | Halh Mongolian | Mongolic-Khitan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | +| khm | Khmer | Austroasiatic | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | +| khs | Kasua | Bosavi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| khz | Keapara | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kik | Kikuyu | Atlantic-Congo | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| kin | Kinyarwanda | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 8 | +| kir | Kirghiz | Turkic | 2 | 3 
| 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | +| kiw | Northeast Kiwai | Kiwaian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kiz | Kisi | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kje | Kisar | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kjs | East Kewa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kkc | Odoodee | East Strickland | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kkl | Kosarek Yale | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| klt | Nukna | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| klv | Maskelynes | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmb | Kimbundu | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kmg | Kâte | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmh | Kalam | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmk | Limos Kalinga | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmo | Kwoma | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmr | Northern Kurdish | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| kms | Kamasau | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kmu | Kanite | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| knc | Central Kanuri | Saharan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kne | Kankanaey | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| knf | Mankanya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| knj | Western Kanjobal | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| knv | Tabo | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kon | Kongo | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kor | Korean | Koreanic | 4 | 8 | 1 | 0 | 1 | 3 | 1 | 
9 | 3 | 0 | 0 | 30 | +| kos | Kosraean | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpf | Komba | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpg | Kapingamarangi | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpj | Karajá | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpr | Korafe-Yegha | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpw | Kobon | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kpx | Mountain Koiali | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kqa | Mum | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kqc | Doromu-Koki | Manubaran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kqf | Kakabai | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kql | Kyenele | Yuat | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kqw | Kandas | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| krc | Karachay-Balkar | Turkic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ksd | Kuanua | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ksj | Uare | Kwalean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ksr | Borong | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ktm | Kurti | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kto | Kuot | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kud | 'Auhelawa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kue | Kuman (Papua New Guinea) | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kup | Kunimaipa | Kunimaipan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kur | Kurdish | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| kvg | Kuni-Boazi | Anim | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kvn | Border 
Kuna | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kwd | Kwaio | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kwf | Kwara'ae | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kwi | Awa-Cuaiquer | Barbacoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kwj | Kwanga | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kyc | Kyaka | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kyf | Kouya | Kru | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kyg | Keyagana | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kyq | Kenga | Central Sudanic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kyz | Kayabí | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kze | Kosena | Bookkeeping | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| kzj | Coastal Kadazan | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lac | Lacandon | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lao | Lao | Tai-Kadai | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | +| lat | Latin | Indo-European | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| lav | Latvian | Indo-European | 1 | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| lbb | Label | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lbk | Central Bontok | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lcm | Tungag | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| leu | Kara (Papua New Guinea) | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lex | Luang | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lfn | Lingua Franca Nova | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lgl | Wala | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lid | Nyindrou | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lif | Limbu | 
Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lij | Ligurian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| lim | Limburgan | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| lin | Lingala | Atlantic-Congo | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | +| lit | Lithuanian | Indo-European | 4 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | +| llg | Lole | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| lmo | Lombard | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| ltg | Latgalian | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| ltz | Luxembourgish | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| lua | Luba-Lulua | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| lug | Ganda | Atlantic-Congo | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | +| luo | Luo (Kenya and Tanzania) | Nilotic | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | +| lus | Lushai | Sino-Tibetan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| lvs | Standard Latvian | Unclassified | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | +| lww | Lewo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| maa | San Jerónimo Tecóatl Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mad | Madurese | Austronesian | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| mag | Magahi | Indo-European | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| mai | Maithili | Indo-European | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| maj | Jalapa De Díaz Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mak | Makasar | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| mal | Malayalam | Dravidian | 7 | 7 | 2 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 19 | +| mam | Mam | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| maq | Chiquihuitlán Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mar | Marathi | Indo-European | 7 | 6 | 2 | 0 | 0 | 1 | 0 | 2 | 2 | 0 | 0 | 20 | +| mau | Huautla Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mav | Sateré-Mawé | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| max | North Moluccan Malay | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| maz | Central Mazahua | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbb | Western Bukidnon Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbc | Macushi | Cariban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbh | Mangseng | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbj | Nadëb | Naduhup | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbl | Maxakalí | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbs | Sarangani Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mbt | Matigsalug Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mca | Maca | Mataguayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcb | Machiguenga | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcd | Sharanahua | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcf | Matsés | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mco | Coatlán Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcp | Makaa | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcq | Ese | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mcr | Menya | Angan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mdy | Male (Ethiopia) | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| med | Melpa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mee | Mengen | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mek | Mekeo | Austronesian 
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| meq | Merey | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| met | Mato | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| meu | Motu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mey | Hassaniyya | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mgc | Morokodo | Central Sudanic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mgh | Makhuwa-Meetto | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mgw | Matumbi | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mhl | Mauwake | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mhr | Eastern Mari | Uralic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mib | Atatláhuca Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mic | Mi'kmaq | Algic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mie | Ocotepec Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mig | San Miguel El Grande Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mih | Chayuco Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mil | Peñoles Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| min | Minangkabau | Austronesian | 3 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | +| mio | Pinotepa Nacional Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mir | Isthmus Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mit | Southern Puebla Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| miz | Coatzospan Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mjc | San Juan Colorado Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mkd | Macedonian | Indo-European | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | +| mkj | Mokilese 
| Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mkl | Mokole | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mkn | Kupang Malay | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mks | Silacayoapan Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mle | Manambu | Ndu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mlg | Malagasy | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mlh | Mape | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mlp | Bargam | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mlt | Maltese | Afro-Asiatic | 2 | 2 | 2 | 0 | 2 | 0 | 0 | 1 | 0 | 0 | 0 | 9 | +| mmo | Mangga Buang | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mmx | Madak | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mna | Mbula | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mni | Manipuri | Sino-Tibetan | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| mon | Mongolian | Unclassified | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| mop | Mopán Maya | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mos | Mossi | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| mox | Molima | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mph | Maung | Iwaidjan Proper | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mpj | Martu Wangka | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mpm | Yosondúa Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mpp | Migabac | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mps | Dadibi | Teberan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mpt | Mian | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mpx | Misima-Panaeati | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 1 | +| mqb | Mbuko | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mqj | Mamasa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mri | Maori | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | +| msa | Malay (macrolanguage) | Unclassified | 1 | 2 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 4 | +| msb | Masbatenyo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| msc | Sankaran Maninka | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| msk | Mansaka | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| msm | Agusan Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| msy | Aruamu | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mti | Maiwa (Papua New Guinea) | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mto | Totontepec Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mui | Musi | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| mup | Malvi | Indo-European | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| mux | Bo-Ung | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| muy | Muyang | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mva | Manam | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mvn | Minaveha | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mwc | Are | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mwe | Mwera (Chimwera) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mwf | Murrinh-Patha | Southern Daly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mwp | Kala Lagaw Ya | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mwr | Marwari | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mxb | Tezoatlán Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mxp | Tlahuitoltepec Mixe | 
Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mxq | Juquila Mixe | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mxt | Jamiltepec Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mya | Burmese | Sino-Tibetan | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 9 | +| myk | Mamara Senoufo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| myu | Mundurukú | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| myw | Muyuw | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| myy | Macuna | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| mzz | Maiadomu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nab | Southern Nambikuára | Nambiquaran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| naf | Nabak | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nak | Nakanai | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nas | Naasioi | South Bougainville | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nbl | South Ndebele | Unclassified | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nbq | Nggem | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nca | Iyo | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nch | Central Huasteca Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ncj | Northern Puebla Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ncl | Michoacán Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ncu | Chumburung | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nde | North Ndebele | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ndg | Ndengereko | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ndj | Ndamba | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nds | Low 
German | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nep | Nepali (macrolanguage) | Unclassified | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| nfa | Dhao | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ngp | Ngulu | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ngu | Guerrero Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhe | Eastern Huasteca Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhg | Tetelcingo Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhi | Zacatlán-Ahuacatlán-Tepetzintla Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nho | Takuu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhr | Naro | Khoe-Kwadi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhu | Noone | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhw | Western Huasteca Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nhy | Northern Oaxaca Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nif | Nek | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nii | Nii | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nij | Ngaju | Austronesian | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| nin | Ninzo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nko | Nkonya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nld | Dutch | Indo-European | 6 | 6 | 1 | 0 | 1 | 1 | 1 | 2 | 2 | 0 | 0 | 20 | +| nlg | Gela | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nna | Nyangumarta | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nno | Norwegian Nynorsk | Unclassified | 4 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | +| nnq | Ngindo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 1 | +| noa | Woun Meu | Chocoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nob | Norwegian Bokmål | Unclassified | 4 | 7 | 5 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 19 | +| noe | Nimadi | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nop | Numanggang | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nor | Norwegian | Indo-European | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 3 | +| not | Nomatsiguenga | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nou | Ewage-Notu | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nov | Novial | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| npi | Nepali (individual language) | Indo-European | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | +| npl | Southeastern Puebla Nahuatl | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nqo | N'Ko | Artificial Language | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| nsn | Nehan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nso | Pedi | Atlantic-Congo | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | +| nss | Nali | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ntj | Ngaanyatjarra | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ntp | Northern Tepehuan | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ntu | Natügu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nus | Nuer | Nilotic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| nuy | Nunggubuyu | Gunwinyguan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nvm | Namiae | Koiarian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nwi | Southwest Tanna | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nya | Nyanja | Atlantic-Congo | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | +| nys | Nyungar | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| nyu | 
Nyungwe | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| obo | Obo Manobo | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| oci | Occitan (post 1500) | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| okv | Orokaiva | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| omw | South Tairora | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ong | Olo | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ons | Ono | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ood | Tohono O'odham | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| opm | Oksapmin | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ori | Oriya (macrolanguage) | Unclassified | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| orm | Oromo | Unclassified | 1 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| orv | Old Russian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ory | Odia | Indo-European | 5 | 4 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 15 | +| ote | Mezquital Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| otm | Eastern Highland Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| otn | Tenango Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| otq | Querétaro Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ots | Estado de México Otomi | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pab | Parecís | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pad | Paumarí | Arawan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pag | Pangasinan | Austronesian | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| pah | Tenharim | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pam | Pampanga | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 1 | +| pan | Panjabi | Indo-European | 6 | 6 | 2 | 0 | 0 | 1 | 0 | 2 | 1 | 0 | 0 | 18 | +| pao | Northern Paiute | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pap | Papiamento | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| pbt | Southern Pashto | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | +| pcm | Nigerian Pidgin | Indo-European | 1 | 4 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| pes | Iranian Persian | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | +| pib | Yine | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pio | Piapoco | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pir | Piratapuyo | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| piu | Pintupi-Luritja | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pjt | Pitjantjatjara | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pls | San Marcos Tlacoyalco Popoloca | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| plt | Plateau Malagasy | Austronesian | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | +| plu | Palikúr | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pma | Paama | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pms | Piemontese | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| poe | San Juan Atzingo Popoloca | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| poh | Poqomchi' | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| poi | Highland Popoluca | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pol | Polish | Indo-European | 4 | 11 | 4 | 0 | 1 | 4 | 0 | 18 | 4 | 0 | 0 | 46 | +| pon | Pohnpeian | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| por | Portuguese | Indo-European | 4 | 9 | 1 | 0 | 2 | 3 | 1 | 5 | 3 | 0 | 0 | 28 | +| poy | Pogolo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| 
ppo | Folopa | Teberan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| prf | Paranan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pri | Paicî | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| prs | Dari | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| ptp | Patep | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ptu | Bambam | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| pus | Pushto | Unclassified | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| pwg | Gapapaiwa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qub | Huallaga Huánuco Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| quc | K'iche' | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| quf | Lambayeque Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| quh | South Bolivian Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qul | North Bolivian Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qup | Southern Pastaza Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| quy | Ayacucho Quechua | Quechuan | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| qvc | Cajamarca Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qve | Eastern Apurímac Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvh | Huamalíes-Dos de Mayo Huánuco Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvm | Margos-Yarowilca-Lauricocha Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvn | North Junín Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvs | San Martín Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvw | Huaylla Wanca Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qvz | Northern Pastaza Quichua | Quechuan | 1 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qwh | Huaylas Ancash Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qxh | Panao Huánuco Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qxn | Northern Conchucos Ancash Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| qxo | Southern Conchucos Ancash Quechua | Quechuan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rai | Ramoaaina | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| raj | Rajasthani | Unclassified | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| reg | Kara (Tanzania) | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rej | Rejang | Austronesian | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| rgu | Ringgou | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rkb | Rikbaktsa | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rmc | Carpathian Romani | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rmy | Vlax Romani | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rom | Romany | Unclassified | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| ron | Romanian | Indo-European | 5 | 6 | 1 | 0 | 1 | 0 | 1 | 3 | 1 | 0 | 0 | 18 | +| roo | Rotokas | North Bougainville | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rop | Kriol | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| row | Dela-Oenale | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rro | Waima | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ruf | Luguru | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| rug | Roviana | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| run | Rundi | Atlantic-Congo | 1 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| rus | Russian | Indo-European | 5 | 13 | 6 | 0 | 2 | 4 | 2 | 16 | 4 | 0 | 0 | 52 | +| rwo | Rawa | Nuclear Trans New Guinea | 1 | 0 
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sab | Buglere | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sag | Sango | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| sah | Yakut | Turkic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| san | Sanskrit | Indo-European | 5 | 3 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 10 | +| sat | Santali | Austroasiatic | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| sbe | Saliba | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sbk | Safwa | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sbs | Subiya | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| scn | Sicilian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| sco | Scots | Indo-European | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| seh | Sena | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sey | Secoya | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sgb | Mag-antsi Ayta | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sgz | Sursurunga | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| shi | Tachelhit | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| shj | Shatt | Dajuic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| shn | Shan | Tai-Kadai | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | +| shp | Shipibo-Conibo | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sim | Mende (Papua New Guinea) | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sin | Sinhala | Indo-European | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | +| sja | Epena | Chocoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| slk | Slovak | Indo-European | 3 | 4 | 1 | 0 | 1 | 0 | 0 | 3 | 0 | 0 | 0 | 12 | +| sll | Salt-Yui | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| slv | Slovenian | Indo-European | 3 | 4 | 1 | 0 | 1 | 0 | 
0 | 1 | 0 | 0 | 0 | 10 | +| smk | Bolinao | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| smo | Samoan | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| sna | Shona | Atlantic-Congo | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | +| snc | Sinaugoro | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| snd | Sindhi | Indo-European | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | +| snn | Siona | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| snp | Siane | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| snx | Sam | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sny | Saniyo-Hiyewe | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| som | Somali | Afro-Asiatic | 3 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 9 | +| soq | Kanasi | Dagan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sot | Southern Sotho | Atlantic-Congo | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | +| soy | Miyobe | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| spa | Spanish | Indo-European | 4 | 13 | 4 | 0 | 1 | 3 | 2 | 13 | 4 | 0 | 0 | 44 | +| spl | Selepet | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| spm | Akukem | Ramu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| spp | Supyire Senoufo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sps | Saposa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| spy | Sabaot | Nilotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sqi | Albanian | Unclassified | 2 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| srd | Sardinian | Unclassified | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| sri | Siriano | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| srm | Saramaccan | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| srn | Sranan Tongo | Indo-European | 2 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 2 | +| srp | Serbian | Indo-European | 4 | 1 | 1 | 0 | 0 | 0 | 1 | 2 | 0 | 0 | 0 | 9 | +| srq | Sirionó | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ssd | Siroi | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ssg | Seimat | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ssw | Swati | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | +| ssx | Samberigi | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| stp | Southeastern Tepehuan | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sua | Sulka | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sue | Suena | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sun | Sundanese | Austronesian | 3 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 9 | +| sus | Susu | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| suz | Sunwar | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| svk | Slovakian Sign Language | Sign Language | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| swa | Swahili (macrolanguage) | Atlantic-Congo | 1 | 7 | 2 | 0 | 0 | 1 | 1 | 3 | 0 | 0 | 0 | 15 | +| swe | Swedish | Indo-European | 4 | 8 | 3 | 0 | 1 | 1 | 1 | 4 | 0 | 0 | 0 | 22 | +| swg | Swabian | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| swh | Swahili (individual language) | Atlantic-Congo | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | +| swp | Suau | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| sxb | Suba | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| szl | Silesian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| tac | Lowland Tarahumara | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tah | Tahitian | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| taj | Eastern Tamang | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 1 | +| tam | Tamil | Dravidian | 7 | 7 | 2 | 0 | 0 | 1 | 0 | 3 | 1 | 0 | 0 | 21 | +| taq | Tamasheq | Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| tat | Tatar | Turkic | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| tav | Tatuyo | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| taw | Tai | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tbc | Takia | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tbf | Mandara | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tbg | North Tairora | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tbo | Tawala | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tbz | Ditammari | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tca | Ticuna | Ticuna-Yuri | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tcs | Torres Strait Creole | Indo-European | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tcz | Thado Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tdt | Tetun Dili | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tee | Huehuetla Tepehua | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tel | Telugu | Dravidian | 7 | 7 | 2 | 0 | 0 | 0 | 1 | 5 | 2 | 0 | 0 | 24 | +| ter | Tereno | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tet | Tetum | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tew | Tewa (USA) | Kiowa-Tanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tfr | Teribe | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tgk | Tajik | Indo-European | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | +| tgl | Tagalog | Austronesian | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | +| tgo | Sudest | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tgp | Tangoa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 1 | +| tha | Thai | Tai-Kadai | 4 | 8 | 1 | 0 | 0 | 1 | 1 | 6 | 0 | 0 | 0 | 21 | +| tif | Tifal | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tim | Timbe | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tir | Tigrinya | Afro-Asiatic | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | +| tiw | Tiwi | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tiy | Tiruray | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tke | Takwane | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tku | Upper Necaxa Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tlf | Telefol | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tmd | Haruai | Piawi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tna | Tacana | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tnc | Tanimuca-Retuarã | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tnk | Kwamera | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tnn | North Tanna | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tnp | Whitesands | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| toc | Coyutla Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tod | Toma | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tof | Gizrra | Eastern Trans-Fly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| toj | Tojolabal | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ton | Tonga (Tonga Islands) | Austronesian | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| too | Xicotepec De Juárez Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| top | Papantla Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tos | Highland Totonac | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tpa | Taupota | 
Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tpi | Tok Pisin | Indo-European | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | +| tpt | Tlachichilco Tepehua | Totonacan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tpz | Tinputz | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| trc | Copala Triqui | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tsn | Tswana | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | +| tso | Tsonga | Atlantic-Congo | 1 | 4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | +| tsw | Tsishingini | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ttc | Tektiteko | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tte | Bwanabwana | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tuc | Mutu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tue | Tuyuca | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tuf | Central Tunebo | Chibchan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tuk | Turkmen | Turkic | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | +| tum | Tumbuka | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| tuo | Tucano | Tucanoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tur | Turkish | Turkic | 4 | 7 | 1 | 0 | 0 | 3 | 0 | 3 | 2 | 0 | 0 | 20 | +| tvk | Southeast Ambrym | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| twi | Twi | Unclassified | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| txq | Tii | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| txu | Kayapó | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tyv | Tuvinian | Turkic | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tzj | Tz'utujil | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tzl | Talossan | Artificial Language | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| tzm | Central Atlas Tamazight | 
Afro-Asiatic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| tzo | Tzotzil | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ubr | Ubir | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ubu | Umbu-Ungu | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| udu | Uduk | Koman | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| uig | Uighur | Turkic | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | +| ukr | Ukrainian | Indo-European | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 8 | +| uli | Ulithian | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ulk | Meriam Mir | Eastern Trans-Fly | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| umb | Umbundu | Atlantic-Congo | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| upv | Uripiv-Wala-Rano-Atchin | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ura | Urarina | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| urb | Urubú-Kaapor | Tupian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| urd | Urdu | Indo-European | 7 | 8 | 2 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 19 | +| uri | Urim | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| urt | Urat | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| urw | Sop | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| usa | Usarufa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| usp | Uspanteco | Mayan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| uvh | Uri | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| uvl | Lote | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| uzb | Uzbek | Unclassified | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| uzn | Northern Uzbek | Turkic | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 4 | +| vec | Venetian | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | +| ven | 
Venda | Atlantic-Congo | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | +| vid | Vidunda | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| vie | Vietnamese | Austroasiatic | 5 | 6 | 1 | 0 | 0 | 1 | 0 | 5 | 0 | 0 | 0 | 18 | +| viv | Iduna | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| vmy | Ayautla Mazatec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| waj | Waffa | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wal | Wolaytta | Ta-Ne-Omotic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wap | Wapishana | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| war | Waray (Philippines) | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | +| wat | Kaninuwa | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wbi | Vwanji | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wbp | Warlpiri | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wed | Wedau | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wer | Weri | Kunimaipan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wim | Wik-Mungkan | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wiu | Wiru | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wiv | Vitu | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wln | Walloon | Indo-European | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wmt | Walmajarri | Pama-Nyungan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wmw | Mwani | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wnc | Wantoat | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wnu | Usan | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wol | Wolof | Atlantic-Congo | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 6 | +| wos | Hanga Hundi | Ndu | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 
| 1 | +| wrk | Garrwa | Garrwan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wro | Worrorra | Worrorran | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wrs | Waris | Border | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wsk | Waskia | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wuu | Wu Chinese | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| wuv | Wuvulu-Aua | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xav | Xavánte | Nuclear-Macro-Je | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xbi | Kombio | Nuclear Torricelli | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xed | Hdi | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xho | Xhosa | Atlantic-Congo | 3 | 3 | 3 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 10 | +| xla | Kamula | Kamula-Elevala | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xnn | Northern Kankanay | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xon | Konkomba | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xsi | Sio | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xtd | Diuxi-Tilantongo Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| xtm | Magdalena Peñasco Mixtec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yaa | Yaminahua | Pano-Tacanan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yad | Yagua | Peba-Yagua | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yal | Yalunka | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yap | Yapese | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yaq | Yaqui | Uto-Aztecan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yby | Yaweyuha | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ycn | Yucuna | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ydd | Eastern Yiddish | Indo-European | 1 | 1 | 1 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 0 | 3 | +| yid | Yiddish | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yka | Yakan | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yle | Yele | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yml | Iamalele | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yon | Yongkom | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yor | Yoruba | Atlantic-Congo | 4 | 5 | 3 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 | 16 | +| yrb | Yareba | Yareban | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yre | Yaouré | Mande | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yss | Yessan-Mayo | Sepik | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yue | Yue Chinese | Sino-Tibetan | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | +| yuj | Karkar-Yuri | Pauwasi | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yut | Yopno | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yuw | Yau (Morobe Province) | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| yva | Yawa | Yawa-Saweru | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zaa | Sierra de Juárez Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zab | Western Tlacolula Valley Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zac | Ocotlán Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zad | Cajonos Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zai | Isthmus Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zaj | Zaramo | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zam | Miahuatlán Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zao | Ozolotepec Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zap | Zapotec | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 
| 0 | 0 | 0 | 0 | 1 | +| zar | Rincón Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zas | Santo Domingo Albarradas Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zat | Tabaa Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zav | Yatzachi Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zaw | Mitla Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zca | Coatecas Altas Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zga | Kinga | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zho | Chinese | Unclassified | 2 | 2 | 1 | 0 | 0 | 2 | 1 | 13 | 0 | 0 | 0 | 21 | +| zia | Zia | Nuclear Trans New Guinea | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| ziw | Zigula | Atlantic-Congo | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zlm | Malay (individual language) | Austronesian | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zos | Francisco León Zoque | Mixe-Zoque | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpc | Choapan Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpl | Lachixío Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpm | Mixtepec Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpo | Amatlán Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpq | Zoogocho Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpu | Yalálag Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpv | Chichicapan Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zpz | Texmelucan Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zsm | Standard Malay | Austronesian | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 5 | +| zsr | Southern Rincon Zapotec | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 
0 | 0 | 0 | 0 | 1 | +| ztq | Quioquitani-Quierí Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zty | Yatee Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| zul | Zulu | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | +| zyp | Zyphe Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | +| Total | None | None | None | 1398 | 836 | 311 | 3 | 28 | 91 | 51 | 507 | 88 | 2 | 2 |
From 3c86eeef173e5b0a3361c58ab03f75ea1806b9f1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 11:46:30 +0000 Subject: [PATCH 176/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 804b9f8475..7578570e30 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. | [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Non-fiction, Written, Programming, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Written, Programming, Non-fiction, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 07a02c54c66275905201437c1e7a4780305e837e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 11:48:36 +0000 Subject: [PATCH 177/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 7578570e30..a93affcd54 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. | [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Written, Programming, Non-fiction, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Programming, Non-fiction, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 8ffa6cf8ddb105711e38ed6106c1120d6d6f5188 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 11:50:43 +0000 Subject: [PATCH 178/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index a93affcd54..bda6f15db4 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the 
tasks in MTEB. | [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Programming, Non-fiction, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Academic, Programming, Written, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 4e8288d5a6080ab7571e97a3978dae88fc65aa19 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 11:53:10 +0000 Subject: [PATCH 179/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index bda6f15db4..95c1c34975 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Academic, Programming, Written, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Programming, Written, Web, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 4977c93085fa210f4de6f26f05f63ca154d90fc0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 11:55:46 +0000 Subject: [PATCH 180/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 95c1c34975..a8a03630d4 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Programming, Written, Web, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Academic, Written, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 33ce26af98314500859908898632303eac309f1f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 11:57:51 +0000 Subject: [PATCH 181/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index a8a03630d4..5167e83a7c 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Academic, Written, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Academic, Web, Programming, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From da378de49241d5436e22c4ad16a551cc18333fa4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 12:00:25 +0000 Subject: [PATCH 182/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 5167e83a7c..4cd4df9be4 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Academic, Web, Programming, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Written, Academic, Non-fiction, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 1a60580daec655ee7ae4fb5e66de92a40b16cb52 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 12:03:26 +0000 Subject: [PATCH 183/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 4cd4df9be4..0dcf6dd4db 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Written, Academic, Non-fiction, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Web, Written, Academic, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 7184a29a5d1af2b4e9f05cc0b030141fb2e59629 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 12:07:21 +0000 Subject: [PATCH 184/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 0dcf6dd4db..e64f3dfbc2 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Web, Written, Academic, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Programming, Academic, Written, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From ec2cf139b492a9023d765cc6ee2051942035a8e2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 12:11:10 +0000 Subject: [PATCH 185/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index e64f3dfbc2..de712a3548 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Programming, Academic, Written, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Web, Non-fiction, Written, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From e4329f0f424bebe768031440f18a747a2d377e55 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 12:14:16 +0000 Subject: [PATCH 186/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index de712a3548..87e5a97b15 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Web, Non-fiction, Written, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Web, Academic, Programming, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 42bea66a5560bd6499a32209e45c3d98a909b400 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 12:16:47 +0000 Subject: [PATCH 187/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 87e5a97b15..a0e2790fb1 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Web, Academic, Programming, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Academic, Written, Web, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 4b88d1c568bf58d73b2610ff21209437bbc7e001 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 12:18:50 +0000 Subject: [PATCH 188/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index a0e2790fb1..bc07d6b5ff 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Academic, Written, Web, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Programming, Written, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 6c0070a4e420fcaf0f38b5674ae0029d3e9ec992 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 12:20:53 +0000 Subject: [PATCH 189/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index bc07d6b5ff..4cd4df9be4 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Programming, Written, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Written, Academic, Non-fiction, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 2cb0c3a19334d6f75531ed829338168b6325a7ec Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 12:23:23 +0000 Subject: [PATCH 190/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 4cd4df9be4..74a0f24fc0 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Written, Academic, Non-fiction, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Academic, Web, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 7258174c40a45a0f47be297579ad4c1ac0111d1d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 12:25:33 +0000 Subject: [PATCH 191/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 74a0f24fc0..a8428cdd76 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Academic, Web, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Web, Written, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From d2e1361ff8fcfce7a2fa1285f57a3aac0fc14a08 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 12:27:43 +0000 Subject: [PATCH 192/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index a8428cdd76..025c47f846 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Non-fiction, Web, Written, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Academic, Programming, Non-fiction, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 32317369a9579952089a44e5c9e876bfebf9390e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 12:31:40 +0000 Subject: [PATCH 193/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 025c47f846..3d4743caa5 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Academic, Programming, Non-fiction, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming, Academic, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 2b4a467125ed86c169749f8546e812fc46a56706 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 12:33:41 +0000 Subject: [PATCH 194/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 3d4743caa5..7578570e30 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming, Academic, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Written, Programming, Non-fiction, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 37ef436f20bbbbe9b0d3f1fb2d6f662a886eb387 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 12:35:41 +0000 Subject: [PATCH 195/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 7578570e30..998022b2a7 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Written, Programming, Non-fiction, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Non-fiction, Written, Web, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From a6c284163e5ce4f30276a9f0ed0248d56d3126f9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 12:37:44 +0000 Subject: [PATCH 196/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 998022b2a7..08ddaf4d1a 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Non-fiction, Written, Web, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Written, Non-fiction, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From f70a994b3839fe276d880e76fd25494367dfadd7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 12:40:36 +0000 Subject: [PATCH 197/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 08ddaf4d1a..7cc2d3be05 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Written, Non-fiction, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Academic, Programming, Non-fiction, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 635ed802dcd46817d5d9c5cdbaff048d25268452 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 12:43:03 +0000 Subject: [PATCH 198/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 7cc2d3be05..0dcf6dd4db 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Academic, Programming, Non-fiction, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Web, Written, Academic, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 75ff333d60f1e93dcb645dbccbdc868dc5bb9420 Mon Sep 17 00:00:00 2001 From: Isaac Chung Date: Sat, 1 Feb 2025 21:44:30 +0900 Subject: [PATCH 199/275] fix: revert rename and add to description (#1918) --- .../SummaryRetrieval/fas/FaMTEBSummaryRetrieval.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mteb/tasks/SummaryRetrieval/fas/FaMTEBSummaryRetrieval.py b/mteb/tasks/SummaryRetrieval/fas/FaMTEBSummaryRetrieval.py index c8d36b9082..f0797068c3 100644 --- a/mteb/tasks/SummaryRetrieval/fas/FaMTEBSummaryRetrieval.py +++ b/mteb/tasks/SummaryRetrieval/fas/FaMTEBSummaryRetrieval.py @@ -7,7 +7,7 @@ class SAMSumFa(AbsTaskBitextMining): 
metadata = TaskMetadata( name="SAMSumFa", - description="Translated Version of SAMSum Dataset", + description="Translated Version of SAMSum Dataset for summary retrieval.", reference="https://huggingface.co/datasets/MCINext/samsum-fa", dataset={ "path": "MCINext/samsum-fa", @@ -35,10 +35,10 @@ def dataset_transform(self): ) -class SynPerChatbotSumSBitextMining(AbsTaskBitextMining): +class SynPerChatbotSumSRetrieval(AbsTaskBitextMining): metadata = TaskMetadata( name="SynPerChatbotSumSRetrieval", - description="Synthetic Persian Chatbot Summary Dataset", + description="Synthetic Persian Chatbot Summary Dataset for summary retrieval.", reference="https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-summary-retrieval", dataset={ "path": "MCINext/synthetic-persian-chatbot-summary-retrieval", @@ -66,10 +66,10 @@ def dataset_transform(self): ) -class SynPerChatbotRAGSumSBitextMining(AbsTaskBitextMining): +class SynPerChatbotRAGSumSRetrieval(AbsTaskBitextMining): metadata = TaskMetadata( name="SynPerChatbotRAGSumSRetrieval", - description="Synthetic Persian Chatbot RAG Summary Dataset", + description="Synthetic Persian Chatbot RAG Summary Dataset for summary retrieval.", reference="https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-rag-summary-retrieval", dataset={ "path": "MCINext/synthetic-persian-chatbot-rag-summary-retrieval", From d9c9b9e157f3ec0dd14f41973bc22f0d49343217 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 12:47:51 +0000 Subject: [PATCH 200/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 0dcf6dd4db..a0e2790fb1 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Web, Written, Academic, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Academic, Written, Web, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 03b23806b6de0d5c915e64c086cb9749cd2445b1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 12:50:03 +0000 Subject: [PATCH 201/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index a0e2790fb1..baf8e2b5f9 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Academic, Written, Web, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Academic, Non-fiction, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 88a2fe1ee4259b411e1b3aeab52acb3c90e97c3a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 12:54:11 +0000 Subject: [PATCH 202/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index baf8e2b5f9..9d38d9be2e 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Academic, Non-fiction, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Written, Web, Non-fiction, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 1f7971f98e02cfae01d67d9764b04d6b93c3754c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 12:56:36 +0000 Subject: [PATCH 203/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 9d38d9be2e..b19d0a23c9 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Written, Web, Non-fiction, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Programming, Non-fiction, Web, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From ad1deffef122123c8681736eb5b01c6f977d16ef Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 12:58:42 +0000 Subject: [PATCH 204/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index b19d0a23c9..e6fe2ab10c 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Programming, Non-fiction, Web, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Written, Non-fiction, Academic, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 96413197195503997517a0ea10197f57ae9822da Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 13:00:57 +0000 Subject: [PATCH 205/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index e6fe2ab10c..b55ef6ce24 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Written, Non-fiction, Academic, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Non-fiction, Programming, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From d9ba6813da9309d20c6e1043d36d8a188cf8e79e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 13:03:34 +0000 Subject: [PATCH 206/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index b55ef6ce24..a93affcd54 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Non-fiction, Programming, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Programming, Non-fiction, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 96f3aff6f86c09012601ef01d68f9c0888b17d8e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 13:06:01 +0000 Subject: [PATCH 207/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index a93affcd54..855dbd7a3c 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Programming, Non-fiction, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Programming, Non-fiction, Written, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 2e34cc72224206fb0a318cca70e22817aa279e18 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 13:08:08 +0000 Subject: [PATCH 208/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 855dbd7a3c..c701f1b739 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Programming, Non-fiction, Written, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Academic, Written, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 5c2cbfc621341032ba8fbc2c93a070817c66f1fa Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 13:10:35 +0000 Subject: [PATCH 209/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index c701f1b739..e64f3dfbc2 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Academic, Written, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Programming, Academic, Written, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From b61de5d54b71136e945baa0e69c1210b3fb1106b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 13:13:06 +0000 Subject: [PATCH 210/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index e64f3dfbc2..6b83f50d95 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Programming, Academic, Written, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Academic, Non-fiction, Programming, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 26ffe3aabfc6ff2fa25fdfaa972ce0056105dd65 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 13:15:34 +0000 Subject: [PATCH 211/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 6b83f50d95..0badb61a8f 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Academic, Non-fiction, Programming, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Non-fiction, Academic, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From ff4e7c62d59fb92d61ed3baa3b7ce7527cb0c6e4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 13:17:39 +0000 Subject: [PATCH 212/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 0badb61a8f..0dcf6dd4db 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Non-fiction, Academic, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Web, Written, Academic, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 13fd52eea8103f7c2677f2f29b8bdba617092e11 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 13:20:57 +0000 Subject: [PATCH 213/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 0dcf6dd4db..06993005b2 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Web, Written, Academic, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Academic, Non-fiction, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 2850833c3823d5a2c1b83e56ac4c2fc16ede1a04 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 13:23:24 +0000 Subject: [PATCH 214/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 06993005b2..4855aaf284 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Programming, Academic, Non-fiction, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Web, Written, Academic, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From d57f988fcc854b2623a04abde64688a80488ec35 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 13:25:45 +0000 Subject: [PATCH 215/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 4855aaf284..981c77827d 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Web, Written, Academic, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Programming, Academic, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From c01563d461e8195c71af421cd4cb151776291dbc Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 13:28:45 +0000 Subject: [PATCH 216/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 981c77827d..80f66663fe 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Programming, Academic, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Academic, Non-fiction, Programming, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From a9be71659f5f050bf1dde5e0b23897474721d82e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 13:30:46 +0000 Subject: [PATCH 217/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 80f66663fe..6660edc2ff 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Academic, Non-fiction, Programming, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming, Web, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From c3ea285750c9a6744e16d0ca4ef7aade3875e880 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 13:33:49 +0000 Subject: [PATCH 218/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 6660edc2ff..5979f2618c 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Programming, Web, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Academic, Programming, Written, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From e823bd768de8857a2e42d396ee0949c14cb2d0fa Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 13:35:51 +0000 Subject: [PATCH 219/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 5979f2618c..bc07d6b5ff 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Academic, Programming, Written, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Programming, Written, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 2756d67e98ef413b640f7e0380fce611f53625d0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 13:38:20 +0000 Subject: [PATCH 220/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index bc07d6b5ff..3ff7131367 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Programming, Written, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Written, Programming, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 6d051da9a498689f7c3f0eb9be8b92e5abc22924 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 13:40:26 +0000 Subject: [PATCH 221/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 3ff7131367..dc07f03a10 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Written, Programming, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Web, Academic, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 6cb089f84f99db873b7476883187f066ce56999a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 13:42:45 +0000 Subject: [PATCH 222/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index dc07f03a10..519b4724c1 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Web, Academic, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Programming, Written, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From f1ea61a7374434ded6cf06fb377a05efab1249b6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 13:44:55 +0000 Subject: [PATCH 223/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 519b4724c1..ae454c0b60 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Programming, Written, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Academic, Web, Programming, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 54d1bd1574779977e55f0b9ba51f3a047b443713 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 13:48:07 +0000 Subject: [PATCH 224/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index ae454c0b60..954e2c7a8b 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Academic, Web, Programming, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Academic, Written, Web, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 887ebf27f859c31f431bb314343e2922258df5b9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 13:50:33 +0000 Subject: [PATCH 225/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 954e2c7a8b..8faa83ee51 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Academic, Written, Web, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Web, Non-fiction, Programming, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 471ea4ced5a13f78e9a2c0949441dcf51654cd36 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 13:53:16 +0000 Subject: [PATCH 226/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 8faa83ee51..4cd4df9be4 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Web, Non-fiction, Programming, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Written, Academic, Non-fiction, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From c72a4baf2ebe33e2db6035894224a73dd0862c68 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 13:55:18 +0000 Subject: [PATCH 227/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 4cd4df9be4..5979f2618c 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Written, Academic, Non-fiction, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Academic, Programming, Written, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From e5ae84f974b6eb38386680afb46e461526709007 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 13:57:44 +0000 Subject: [PATCH 228/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 5979f2618c..c8d92754b0 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Academic, Programming, Written, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Academic, Written, Programming, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From d6ff9d0b8496dcfdd82486fa5d18361f0cc6e4c9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 13:59:46 +0000 Subject: [PATCH 229/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index c8d92754b0..c701f1b739 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Academic, Written, Programming, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Academic, Written, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 745e2e6edddfcc8a838aff3c5b1f00608c75c56e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 14:01:56 +0000 Subject: [PATCH 230/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index c701f1b739..954e2c7a8b 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Academic, Written, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Academic, Written, Web, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From dafbb8088cc043e8bc6eba7f6bd355a0d649d151 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 14:04:12 +0000 Subject: [PATCH 231/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 954e2c7a8b..025c47f846 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Academic, Written, Web, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Academic, Programming, Non-fiction, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From ea6c1a2b81ff15612d5e25ff1983d67886640df0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 14:08:29 +0000 Subject: [PATCH 232/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 025c47f846..ceaff491bd 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Academic, Programming, Non-fiction, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Academic, Web, Written, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 53f4e2e53f9f3df5d749b1e3fff125004762577c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 14:11:34 +0000 Subject: [PATCH 233/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index ceaff491bd..5075903f81 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Academic, Web, Written, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Programming, Written, Academic, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From bf3256a58a5061ee3bbd429fe12147d7f50f04de Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 14:13:39 +0000 Subject: [PATCH 234/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 5075903f81..2a51276828 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Programming, Written, Academic, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Written, Programming, Non-fiction, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 0e8a539a191394af6f0379c438757a1c775ca0ea Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 14:15:43 +0000 Subject: [PATCH 235/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 2a51276828..98b3b70737 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Written, Programming, Non-fiction, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Academic, Web, Non-fiction, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 52c000d638ab70b1b1b03ae0b7e14d7813c67a46 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 14:17:48 +0000 Subject: [PATCH 236/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 98b3b70737..80b4849817 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Academic, Web, Non-fiction, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Academic, Programming, Web, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 0ac5bf223559f7fd1441bdd224010d9aa4dfa827 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 14:22:09 +0000 Subject: [PATCH 237/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 80b4849817..3bb3457e71 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Academic, Programming, Web, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Written, Non-fiction, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From f42d5d024b4f291d49ccf0cba5106df061642c7a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 14:24:25 +0000 Subject: [PATCH 238/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 3bb3457e71..0396ba1f6a 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Written, Non-fiction, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Academic, Written, Programming, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From df3ef70de53677d00ad76383273bc56ade3b6a1d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 14:26:58 +0000 Subject: [PATCH 239/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 0396ba1f6a..dd54bbf86f 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Academic, Written, Programming, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Programming, Academic, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From e183458d17b031adc4887a1f60e0e9b8c10708a5 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 14:28:59 +0000 Subject: [PATCH 240/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index dd54bbf86f..bc07d6b5ff 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Programming, Academic, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Programming, Written, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 2b5f320ebf8a36d643e42dd03c18a2f829a608a3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 14:31:25 +0000 Subject: [PATCH 241/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index bc07d6b5ff..981c77827d 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Programming, Written, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Programming, Academic, Web] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 996c5228d03b8086e733ed10afd92102b4982010 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 14:33:31 +0000 Subject: [PATCH 242/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 981c77827d..0f811dd0c7 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Programming, Academic, Web] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Written, Academic, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From de3a1f9bdc5ebe9e89d1e477f508144d037d8f0e Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 14:35:55 +0000 Subject: [PATCH 243/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 0f811dd0c7..9ef900ac38 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Written, Academic, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Academic, Non-fiction, Web, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 0cd396e3505130b4a10afdd083723b1222746e47 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 14:38:39 +0000 Subject: [PATCH 244/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 9ef900ac38..7325d745d2 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Academic, Non-fiction, Web, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Written, Academic, Programming, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 974ff3ca6b73e7ad34dde05fe9e49d223998f029 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 14:40:45 +0000 Subject: [PATCH 245/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 7325d745d2..8492a98538 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Written, Academic, Programming, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Academic, Written, Non-fiction, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 0ae0417933655cb81917f46ea6a78e1b05dc86fc Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 14:43:17 +0000 Subject: [PATCH 246/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 8492a98538..729131838f 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Academic, Written, Non-fiction, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Non-fiction, Web, Written, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From c275b10b693d2d3c84d0808a7db81156e51cc182 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 14:45:20 +0000 Subject: [PATCH 247/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 729131838f..14ca1feaa3 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Non-fiction, Web, Written, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Non-fiction, Written, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 429438973aea73505a2282a2d1a2f823e6ad31d7 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 14:48:06 +0000 Subject: [PATCH 248/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 14ca1feaa3..8faa83ee51 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Web, Non-fiction, Written, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Web, Non-fiction, Programming, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 8cdb25a54b548699d0ddc4583ff22186d461892b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 14:50:09 +0000 Subject: [PATCH 249/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 8faa83ee51..954e2c7a8b 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Web, Non-fiction, Programming, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Academic, Written, Web, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 9146cc3ad979e5d9b9bf22e7b9b08e0a0f7b3e61 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 14:52:39 +0000 Subject: [PATCH 250/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 954e2c7a8b..37b95c1cf3 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Academic, Written, Web, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Web, Programming, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 7474c97fed493abc12aa83f29dac21c996e57c85 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 14:55:02 +0000 Subject: [PATCH 251/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 37b95c1cf3..2a51276828 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Web, Programming, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Written, Programming, Non-fiction, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 4be535282293b25d028cd491201e6a14bca9d361 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 14:57:03 +0000 Subject: [PATCH 252/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 2a51276828..c701f1b739 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Written, Programming, Non-fiction, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Academic, Written, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From a420249ad20ff69cde6705ee3dd483e9b9ea57f3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 15:04:34 +0000 Subject: [PATCH 253/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index c701f1b739..7325d745d2 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Programming, Academic, Written, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Written, Academic, Programming, Non-fiction] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 597b8fceaca83e8481018954a197905af8893b61 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 15:06:52 +0000 Subject: [PATCH 254/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 7325d745d2..9c1cab9a81 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Written, Academic, Programming, Non-fiction] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Web, Written, Non-fiction, Academic] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From e344a2ebd4db22e278a3f7db81347ebfa31d1544 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 15:09:37 +0000 Subject: [PATCH 255/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 9c1cab9a81..24e0a2bc38 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Programming, Web, Written, Non-fiction, Academic] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Academic, Web, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 2b95d66bc1ba4012aa097f94ecaefa74f4663f4f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 15:12:03 +0000 Subject: [PATCH 256/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 24e0a2bc38..6b496dc43e 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Academic, Web, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Academic, Web, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 6072eaeb8aa011edcf435ae7c80e2165210d37d6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 15:15:31 +0000 Subject: [PATCH 257/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 6b496dc43e..24e0a2bc38 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Non-fiction, Written, Academic, Web, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Academic, Web, Programming] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From e932dfc3096374117a20c0a03ec7da06eaa9f745 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 15:17:48 +0000 Subject: [PATCH 258/275] Update tasks table --- docs/tasks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tasks.md b/docs/tasks.md index 24e0a2bc38..2aa102be00 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -83,7 +83,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Written, Non-fiction, Academic, Web, Programming] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Academic, Programming, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | | [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | From 6f673ba0350a73c3b0bd39a22c704b36640ef1ff Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 1 Feb 2025 16:18:57 +0100 Subject: [PATCH 259/275] docs: Add sort to domains for task metadata (#1922) Tests currently go into an infinite loop. This should prevent that. 
--- docs/create_tasks_table.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/create_tasks_table.py b/docs/create_tasks_table.py index 4a1be0cd89..33dca958cb 100644 --- a/docs/create_tasks_table.py +++ b/docs/create_tasks_table.py @@ -50,7 +50,9 @@ def task_to_markdown_row(task: mteb.AbsTask) -> str: f"[{name}]({task.metadata.reference})" if task.metadata.reference else name ) domains = ( - "[" + ", ".join(task.metadata.domains) + "]" if task.metadata.domains else "" + "[" + ", ".join(sorted(task.metadata.domains)) + "]" + if task.metadata.domains + else "" ) n_samples = task.metadata.n_samples dataset_statistics = round_floats_in_dict(task.metadata.descriptive_stats) From 14616dc2a8fcad80ce0394806223a9ddd54457e8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 15:22:45 +0000 Subject: [PATCH 260/275] Update tasks table --- docs/tasks.md | 174 +++++++++++++++++++++++++------------------------- 1 file changed, 87 insertions(+), 87 deletions(-) diff --git a/docs/tasks.md b/docs/tasks.md index 2aa102be00..d6e5cc9bd1 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -18,7 +18,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [AllegroReviews](https://aclanthology.org/2020.acl-main.111.pdf) | ['pol'] | Classification | s2s | | None | None | | [AlloProfClusteringP2P.v2](https://huggingface.co/datasets/lyon-nlp/alloprof) (Lefebvre-Brossard et al., 2023) | ['fra'] | Clustering | p2p | [Encyclopaedic, Written] | None | None | | [AlloProfClusteringS2S.v2](https://huggingface.co/datasets/lyon-nlp/alloprof) (Lefebvre-Brossard et al., 2023) | ['fra'] | Clustering | s2s | [Encyclopaedic, Written] | None | None | -| [AlloprofReranking](https://huggingface.co/datasets/antoinelb7/alloprof) (Lefebvre-Brossard et al., 2023) | ['fra'] | Reranking | s2p | [Web, Academic, Written] | None | None | +| [AlloprofReranking](https://huggingface.co/datasets/antoinelb7/alloprof) (Lefebvre-Brossard et al., 2023) | ['fra'] | Reranking | s2p | [Academic, Web, Written] | None | None | | [AlloprofRetrieval](https://huggingface.co/datasets/antoinelb7/alloprof) (Lefebvre-Brossard et al., 2023) | ['fra'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [AlphaNLI](https://leaderboard.allenai.org/anli/submissions/get-started) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [AmazonCounterfactualClassification](https://arxiv.org/abs/2104.06893) | ['deu', 'eng', 'jpn'] | Classification | s2s | [Reviews, Written] | None | None | @@ -37,13 +37,13 @@ The following tables give you an overview of the tasks in MTEB. 
| [AskUbuntuDupQuestions](https://github.com/taolei87/askubuntu) | ['eng'] | Reranking | s2s | [Programming, Web] | {'test': 375} | {'test': {'num_samples': 375, 'number_of_characters': 413674, 'num_positive': 2255, 'num_negative': 5245, 'min_query_length': 17, 'avg_query_length': 50.21, 'max_query_length': 148, 'unique_query': 374, 'min_positive_length': 15, 'avg_positive_length': 52.54, 'max_positive_length': 152, 'unique_positive': 2165, 'min_negative_length': 15, 'avg_negative_length': 52.69, 'max_negative_length': 148, 'unique_negative': 5002}} | | [Assin2RTE](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) (Real et al., 2020) | ['por'] | PairClassification | s2s | [Written] | None | None | | [Assin2STS](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) (Real et al., 2020) | ['por'] | STS | s2s | [Written] | None | None | -| [AutoRAGRetrieval](https://arxiv.org/abs/2410.20878) (Dongkyu Kim, 2024) | ['kor'] | Retrieval | s2p | [Government, Medical, Legal, Social, Financial] | {'test': 834} | {'test': {'number_of_characters': 894.22, 'num_samples': 834, 'num_queries': 114, 'num_documents': 720, 'average_document_length': 1.15, 'average_query_length': 0.61, 'average_relevant_docs_per_query': 1.0}} | +| [AutoRAGRetrieval](https://arxiv.org/abs/2410.20878) (Dongkyu Kim, 2024) | ['kor'] | Retrieval | s2p | [Financial, Government, Legal, Medical, Social] | {'test': 834} | {'test': {'number_of_characters': 894.22, 'num_samples': 834, 'num_queries': 114, 'num_documents': 720, 'average_document_length': 1.15, 'average_query_length': 0.61, 'average_relevant_docs_per_query': 1.0}} | | [BIOSSES](https://tabilab.cmpe.boun.edu.tr/BIOSSES/DataSet.html) (Soğancıoğlu et al., 2017) | ['eng'] | STS | s2s | [Medical] | None | None | | [BQ](https://aclanthology.org/2021.emnlp-main.357) (Shitao Xiao, 2024) | ['cmn'] | STS | s2s | | None | None | | [BSARDRetrieval](https://huggingface.co/datasets/maastrichtlawtech/bsard) (Louis et al., 2022) | ['fra'] | 
Retrieval | s2p | [Legal, Spoken] | None | None | | [BUCC.v2](https://comparable.limsi.fr/bucc2018/bucc2018-task.html) | ['cmn', 'deu', 'eng', 'fra', 'rus'] | BitextMining | s2s | [Written] | {'test': 35000} | {'test': {'num_samples': 35000, 'number_of_characters': 6640032, 'unique_pairs': 34978, 'min_sentence1_length': 16, 'average_sentence1_length': 99.11, 'max_sentence1_length': 204, 'unique_sentence1': 34978, 'min_sentence2_length': 42, 'average_sentence2_length': 90.61, 'max_sentence2_length': 159, 'unique_sentence2': 25306, 'hf_subset_descriptive_stats': {'de-en': {'num_samples': 9580, 'number_of_characters': 1919197, 'unique_pairs': 9573, 'min_sentence1_length': 50, 'average_sentence1_length': 109.08, 'max_sentence1_length': 204, 'unique_sentence1': 9573, 'min_sentence2_length': 46, 'average_sentence2_length': 91.25, 'max_sentence2_length': 155, 'unique_sentence2': 9570}, 'fr-en': {'num_samples': 9086, 'number_of_characters': 1677545, 'unique_pairs': 9081, 'min_sentence1_length': 43, 'average_sentence1_length': 99.32, 'max_sentence1_length': 174, 'unique_sentence1': 9081, 'min_sentence2_length': 42, 'average_sentence2_length': 85.31, 'max_sentence2_length': 159, 'unique_sentence2': 9076}, 'ru-en': {'num_samples': 14435, 'number_of_characters': 2808206, 'unique_pairs': 14425, 'min_sentence1_length': 40, 'average_sentence1_length': 101.66, 'max_sentence1_length': 186, 'unique_sentence1': 14425, 'min_sentence2_length': 45, 'average_sentence2_length': 92.88, 'max_sentence2_length': 159, 'unique_sentence2': 14424}, 'zh-en': {'num_samples': 1899, 'number_of_characters': 235084, 'unique_pairs': 1899, 'min_sentence1_length': 16, 'average_sentence1_length': 28.43, 'max_sentence1_length': 40, 'unique_sentence1': 1899, 'min_sentence2_length': 48, 'average_sentence2_length': 95.36, 'max_sentence2_length': 159, 'unique_sentence2': 1899}}}} | | [Banking77Classification](https://arxiv.org/abs/2003.04807) | ['eng'] | Classification | s2s | [Written] | None | None | -| 
[BelebeleRetrieval](https://arxiv.org/abs/2308.16884) (Lucas Bandarkar, 2023) | ['acm', 'afr', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'azj', 'bam', 'ben', 'bod', 'bul', 'cat', 'ceb', 'ces', 'ckb', 'dan', 'deu', 'ell', 'eng', 'est', 'eus', 'fin', 'fra', 'fuv', 'gaz', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kac', 'kan', 'kat', 'kaz', 'kea', 'khk', 'khm', 'kin', 'kir', 'kor', 'lao', 'lin', 'lit', 'lug', 'luo', 'lvs', 'mal', 'mar', 'mkd', 'mlt', 'mri', 'mya', 'nld', 'nob', 'npi', 'nso', 'nya', 'ory', 'pan', 'pbt', 'pes', 'plt', 'pol', 'por', 'ron', 'rus', 'shn', 'sin', 'slk', 'slv', 'sna', 'snd', 'som', 'sot', 'spa', 'srp', 'ssw', 'sun', 'swe', 'swh', 'tam', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tsn', 'tso', 'tur', 'ukr', 'urd', 'uzn', 'vie', 'war', 'wol', 'xho', 'yor', 'zho', 'zsm', 'zul'] | Retrieval | s2p | [Web, News, Written] | {'test': 521866} | {'test': {'number_of_characters': 25574620, 'num_samples': 521866, 'num_queries': 338378, 'num_documents': 183488, 'min_document_length': 4, 'average_document_length': 137.38, 'max_document_length': 237, 'unique_documents': 183488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 338378, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 183488, 'hf_subset_descriptive_stats': {'acm_Arab-acm_Arab': {'number_of_characters': 51232, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 102.98, 'max_document_length': 129, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'acm_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 
'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-acm_Arab': {'number_of_characters': 51232, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 102.98, 'max_document_length': 129, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'afr_Latn-afr_Latn': {'number_of_characters': 71217, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 143.94, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'afr_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-afr_Latn': {'number_of_characters': 71217, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 143.94, 'max_document_length': 
159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'als_Latn-als_Latn': {'number_of_characters': 69498, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 140.41, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'als_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-als_Latn': {'number_of_characters': 69498, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 140.41, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'amh_Ethi-amh_Ethi': {'number_of_characters': 45221, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 90.67, 'max_document_length': 100, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'amh_Ethi-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-amh_Ethi': {'number_of_characters': 45221, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 90.67, 'max_document_length': 100, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'apc_Arab-apc_Arab': {'number_of_characters': 51248, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 103.02, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'apc_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'eng_Latn-apc_Arab': {'number_of_characters': 51248, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 103.02, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Arab-arb_Arab': {'number_of_characters': 53671, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 107.98, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-arb_Arab': {'number_of_characters': 53671, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 107.98, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Latn-arb_Latn': {'number_of_characters': 61298, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 12, 'average_document_length': 123.61, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-arb_Latn': {'number_of_characters': 61298, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 123.61, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ars_Arab-ars_Arab': {'number_of_characters': 51765, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 104.08, 'max_document_length': 119, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ars_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ars_Arab': {'number_of_characters': 51765, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 104.08, 'max_document_length': 119, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ary_Arab-ary_Arab': {'number_of_characters': 60261, 'num_samples': 1386, 'num_queries': 898, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 121.49, 'max_document_length': 138, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 898, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ary_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ary_Arab': {'number_of_characters': 60261, 'num_samples': 1386, 'num_queries': 898, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 121.49, 'max_document_length': 138, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 898, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arz_Arab-arz_Arab': {'number_of_characters': 52403, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 105.38, 'max_document_length': 115, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arz_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-arz_Arab': {'number_of_characters': 52403, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 105.38, 'max_document_length': 115, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'asm_Beng-asm_Beng': {'number_of_characters': 62410, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 4, 'average_document_length': 125.89, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'asm_Beng-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-asm_Beng': {'number_of_characters': 62410, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 4, 'average_document_length': 125.89, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'azj_Latn-azj_Latn': {'number_of_characters': 67137, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 135.58, 'max_document_length': 156, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'azj_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-azj_Latn': {'number_of_characters': 67137, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 12, 'average_document_length': 135.58, 'max_document_length': 156, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bam_Latn-bam_Latn': {'number_of_characters': 66084, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 133.42, 'max_document_length': 166, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bam_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-bam_Latn': {'number_of_characters': 66084, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 133.42, 'max_document_length': 166, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Beng-ben_Beng': {'number_of_characters': 63512, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 128.15, 'max_document_length': 175, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Beng-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ben_Beng': {'number_of_characters': 63512, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 128.15, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Latn-ben_Latn': {'number_of_characters': 68285, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 137.93, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ben_Latn': {'number_of_characters': 68285, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 137.93, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bod_Tibt-bod_Tibt': {'number_of_characters': 79188, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 160.27, 'max_document_length': 213, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bod_Tibt-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-bod_Tibt': {'number_of_characters': 79188, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 160.27, 'max_document_length': 213, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'bul_Cyrl-bul_Cyrl': {'number_of_characters': 66577, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 134.43, 'max_document_length': 177, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bul_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-bul_Cyrl': {'number_of_characters': 66577, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 134.43, 'max_document_length': 177, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'cat_Latn-cat_Latn': {'number_of_characters': 68842, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 139.07, 'max_document_length': 163, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'cat_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-cat_Latn': {'number_of_characters': 68842, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 139.07, 'max_document_length': 163, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ceb_Latn-ceb_Latn': {'number_of_characters': 74053, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 149.75, 'max_document_length': 184, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ceb_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ceb_Latn': {'number_of_characters': 74053, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 149.75, 'max_document_length': 184, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ces_Latn-ces_Latn': {'number_of_characters': 61936, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 124.92, 'max_document_length': 139, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ces_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ces_Latn': {'number_of_characters': 61936, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 124.92, 'max_document_length': 139, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ckb_Arab-ckb_Arab': {'number_of_characters': 64917, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 131.03, 'max_document_length': 178, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ckb_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ckb_Arab': {'number_of_characters': 64917, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 131.03, 'max_document_length': 178, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'dan_Latn-dan_Latn': {'number_of_characters': 66648, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 134.57, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'dan_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'eng_Latn-dan_Latn': {'number_of_characters': 66648, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 134.57, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'deu_Latn-deu_Latn': {'number_of_characters': 68768, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 138.92, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'deu_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-deu_Latn': {'number_of_characters': 68768, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 138.92, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ell_Grek-ell_Grek': {'number_of_characters': 79210, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 18, 'average_document_length': 160.32, 'max_document_length': 212, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ell_Grek-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ell_Grek': {'number_of_characters': 79210, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 160.32, 'max_document_length': 212, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'est_Latn-est_Latn': {'number_of_characters': 61779, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 124.6, 'max_document_length': 164, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'est_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-est_Latn': {'number_of_characters': 61779, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 124.6, 'max_document_length': 164, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eus_Latn-eus_Latn': {'number_of_characters': 67979, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 137.3, 'max_document_length': 169, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eus_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-eus_Latn': {'number_of_characters': 67979, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 137.3, 'max_document_length': 169, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fin_Latn-fin_Latn': {'number_of_characters': 66234, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.73, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-fin_Latn': {'number_of_characters': 66234, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.73, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'fra_Latn-fra_Latn': {'number_of_characters': 82464, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 166.98, 'max_document_length': 204, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fra_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-fra_Latn': {'number_of_characters': 82464, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 166.98, 'max_document_length': 204, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fuv_Latn-fuv_Latn': {'number_of_characters': 53555, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 107.74, 'max_document_length': 122, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fuv_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-fuv_Latn': {'number_of_characters': 53555, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 107.74, 'max_document_length': 122, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'gaz_Latn-gaz_Latn': {'number_of_characters': 78315, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 158.48, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'gaz_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-gaz_Latn': {'number_of_characters': 78315, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 158.48, 'max_document_length': 191, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'grn_Latn-grn_Latn': {'number_of_characters': 68572, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 138.52, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'grn_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-grn_Latn': {'number_of_characters': 68572, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 138.52, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'guj_Gujr-guj_Gujr': {'number_of_characters': 57007, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 114.82, 'max_document_length': 138, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'guj_Gujr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-guj_Gujr': {'number_of_characters': 57007, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 114.82, 'max_document_length': 138, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hat_Latn-hat_Latn': {'number_of_characters': 64558, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 130.29, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hat_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'eng_Latn-hat_Latn': {'number_of_characters': 64558, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 130.29, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hau_Latn-hau_Latn': {'number_of_characters': 78240, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 158.33, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hau_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hau_Latn': {'number_of_characters': 78240, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 158.33, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'heb_Hebr-heb_Hebr': {'number_of_characters': 50598, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 12, 'average_document_length': 101.68, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'heb_Hebr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-heb_Hebr': {'number_of_characters': 50598, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 101.68, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Deva-hin_Deva': {'number_of_characters': 66332, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.93, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Deva-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hin_Deva': {'number_of_characters': 66332, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.93, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Latn-hin_Latn': {'number_of_characters': 68307, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.97, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hin_Latn': {'number_of_characters': 68307, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.97, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hrv_Latn-hrv_Latn': {'number_of_characters': 62928, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 126.95, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hrv_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hrv_Latn': {'number_of_characters': 62928, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 126.95, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hun_Latn-hun_Latn': {'number_of_characters': 67941, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 137.22, 'max_document_length': 176, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'hun_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hun_Latn': {'number_of_characters': 67941, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 137.22, 'max_document_length': 176, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hye_Armn-hye_Armn': {'number_of_characters': 68859, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 139.1, 'max_document_length': 193, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hye_Armn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hye_Armn': {'number_of_characters': 68859, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 15, 'average_document_length': 139.1, 'max_document_length': 193, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ibo_Latn-ibo_Latn': {'number_of_characters': 66167, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 133.59, 'max_document_length': 156, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'ibo_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ibo_Latn': {'number_of_characters': 66167, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 133.59, 'max_document_length': 156, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'ilo_Latn-ilo_Latn': {'number_of_characters': 78161, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 158.17, 'max_document_length': 187, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ilo_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ilo_Latn': {'number_of_characters': 78161, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 158.17, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ind_Latn-ind_Latn': {'number_of_characters': 74871, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 151.42, 'max_document_length': 207, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ind_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ind_Latn': {'number_of_characters': 74871, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 151.42, 'max_document_length': 207, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'isl_Latn-isl_Latn': {'number_of_characters': 70522, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 142.51, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'isl_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-isl_Latn': {'number_of_characters': 70522, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 142.51, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'ita_Latn-ita_Latn': {'number_of_characters': 76124, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 153.99, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ita_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ita_Latn': {'number_of_characters': 76124, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 153.99, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'jav_Latn-jav_Latn': {'number_of_characters': 71722, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 144.97, 'max_document_length': 174, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'jav_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-jav_Latn': {'number_of_characters': 71722, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 144.97, 'max_document_length': 174, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'jpn_Jpan-jpn_Jpan': {'number_of_characters': 33187, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 66.01, 'max_document_length': 76, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'jpn_Jpan-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-jpn_Jpan': {'number_of_characters': 33187, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 66.01, 'max_document_length': 76, 'unique_documents': 
488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kac_Latn-kac_Latn': {'number_of_characters': 89655, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 181.72, 'max_document_length': 195, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kac_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kac_Latn': {'number_of_characters': 89655, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 181.72, 'max_document_length': 195, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kan_Knda-kan_Knda': {'number_of_characters': 65899, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.04, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kan_Knda-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kan_Knda': {'number_of_characters': 65899, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.04, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kat_Geor-kat_Geor': {'number_of_characters': 68309, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.98, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kat_Geor-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'eng_Latn-kat_Geor': {'number_of_characters': 68309, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.98, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kaz_Cyrl-kaz_Cyrl': {'number_of_characters': 64657, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 130.49, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kaz_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kaz_Cyrl': {'number_of_characters': 64657, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 130.49, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kea_Latn-kea_Latn': {'number_of_characters': 69323, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 14, 'average_document_length': 140.06, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kea_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kea_Latn': {'number_of_characters': 69323, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 140.06, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'khk_Cyrl-khk_Cyrl': {'number_of_characters': 66977, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 135.25, 'max_document_length': 162, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'khk_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-khk_Cyrl': {'number_of_characters': 66977, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 135.25, 'max_document_length': 162, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'khm_Khmr-khm_Khmr': {'number_of_characters': 69150, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 139.7, 'max_document_length': 169, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'khm_Khmr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-khm_Khmr': {'number_of_characters': 69150, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 139.7, 'max_document_length': 169, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kin_Latn-kin_Latn': {'number_of_characters': 72803, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 147.19, 'max_document_length': 194, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'kin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kin_Latn': {'number_of_characters': 72803, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 147.19, 'max_document_length': 194, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'kir_Cyrl-kir_Cyrl': {'number_of_characters': 67957, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 137.26, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'kir_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kir_Cyrl': {'number_of_characters': 67957, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 137.26, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kor_Hang-kor_Hang': {'number_of_characters': 32708, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 65.02, 'max_document_length': 88, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kor_Hang-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kor_Hang': {'number_of_characters': 32708, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 13, 'average_document_length': 65.02, 'max_document_length': 88, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lao_Laoo-lao_Laoo': {'number_of_characters': 57958, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 116.77, 'max_document_length': 142, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lao_Laoo-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lao_Laoo': {'number_of_characters': 57958, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 116.77, 'max_document_length': 142, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lin_Latn-lin_Latn': {'number_of_characters': 74223, 'num_samples': 1386, 'num_queries': 898, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 150.1, 'max_document_length': 183, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 898, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'lin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lin_Latn': {'number_of_characters': 74223, 'num_samples': 1386, 'num_queries': 898, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 150.1, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 898, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'lit_Latn-lit_Latn': {'number_of_characters': 62805, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 126.7, 'max_document_length': 167, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lit_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lit_Latn': {'number_of_characters': 62805, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 126.7, 'max_document_length': 167, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lug_Latn-lug_Latn': {'number_of_characters': 71566, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 144.65, 'max_document_length': 237, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'lug_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lug_Latn': {'number_of_characters': 71566, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 144.65, 'max_document_length': 237, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 
'unique_relevant_docs': 488}, 'luo_Latn-luo_Latn': {'number_of_characters': 66805, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 134.9, 'max_document_length': 178, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'luo_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-luo_Latn': {'number_of_characters': 66805, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 134.9, 'max_document_length': 178, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lvs_Latn-lvs_Latn': {'number_of_characters': 63957, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 129.06, 'max_document_length': 172, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lvs_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lvs_Latn': {'number_of_characters': 63957, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 129.06, 'max_document_length': 172, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mal_Mlym-mal_Mlym': {'number_of_characters': 73599, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 148.82, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mal_Mlym-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mal_Mlym': {'number_of_characters': 73599, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 148.82, 'max_document_length': 191, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mar_Deva-mar_Deva': {'number_of_characters': 62671, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 126.42, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'mar_Deva-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mar_Deva': {'number_of_characters': 62671, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 126.42, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'mkd_Cyrl-mkd_Cyrl': {'number_of_characters': 67588, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 136.5, 'max_document_length': 180, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mkd_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mkd_Cyrl': {'number_of_characters': 67588, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 136.5, 'max_document_length': 180, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mlt_Latn-mlt_Latn': {'number_of_characters': 68480, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 138.33, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mlt_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'eng_Latn-mlt_Latn': {'number_of_characters': 68480, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 138.33, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mri_Latn-mri_Latn': {'number_of_characters': 74519, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 150.7, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mri_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mri_Latn': {'number_of_characters': 74519, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 150.7, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mya_Mymr-mya_Mymr': {'number_of_characters': 81331, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 18, 'average_document_length': 164.66, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mya_Mymr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mya_Mymr': {'number_of_characters': 81331, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 164.66, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nld_Latn-nld_Latn': {'number_of_characters': 68789, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 138.96, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nld_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-nld_Latn': {'number_of_characters': 68789, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 138.96, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nob_Latn-nob_Latn': {'number_of_characters': 64917, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 131.03, 'max_document_length': 168, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nob_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-nob_Latn': {'number_of_characters': 64917, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 131.03, 'max_document_length': 168, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Deva-npi_Deva': {'number_of_characters': 61183, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 123.38, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Deva-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-npi_Deva': {'number_of_characters': 61183, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 123.38, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Latn-npi_Latn': {'number_of_characters': 65683, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 132.6, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'npi_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-npi_Latn': {'number_of_characters': 65683, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 132.6, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nso_Latn-nso_Latn': {'number_of_characters': 79073, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 160.03, 'max_document_length': 235, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nso_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-nso_Latn': {'number_of_characters': 79073, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 12, 'average_document_length': 160.03, 'max_document_length': 235, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nya_Latn-nya_Latn': {'number_of_characters': 82685, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 167.44, 'max_document_length': 215, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nya_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-nya_Latn': {'number_of_characters': 82685, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 167.44, 'max_document_length': 215, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ory_Orya-ory_Orya': {'number_of_characters': 66638, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 10, 'average_document_length': 134.55, 'max_document_length': 168, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ory_Orya-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ory_Orya': {'number_of_characters': 66638, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 10, 'average_document_length': 134.55, 'max_document_length': 168, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pan_Guru-pan_Guru': {'number_of_characters': 66944, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 135.18, 'max_document_length': 157, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pan_Guru-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-pan_Guru': {'number_of_characters': 66944, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 135.18, 'max_document_length': 157, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pbt_Arab-pbt_Arab': {'number_of_characters': 61880, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 124.8, 'max_document_length': 155, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pbt_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-pbt_Arab': {'number_of_characters': 61880, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 124.8, 'max_document_length': 155, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'pes_Arab-pes_Arab': {'number_of_characters': 59252, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 119.42, 'max_document_length': 152, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pes_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-pes_Arab': {'number_of_characters': 59252, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 119.42, 'max_document_length': 152, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'plt_Latn-plt_Latn': {'number_of_characters': 86472, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 175.2, 'max_document_length': 222, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'plt_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-plt_Latn': {'number_of_characters': 86472, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 175.2, 'max_document_length': 222, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pol_Latn-pol_Latn': {'number_of_characters': 67664, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 136.66, 'max_document_length': 196, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pol_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-pol_Latn': {'number_of_characters': 67664, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 136.66, 'max_document_length': 196, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'por_Latn-por_Latn': {'number_of_characters': 71281, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 144.07, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'por_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-por_Latn': {'number_of_characters': 71281, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 144.07, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ron_Latn-ron_Latn': {'number_of_characters': 71844, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 145.22, 'max_document_length': 181, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ron_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ron_Latn': {'number_of_characters': 71844, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 145.22, 'max_document_length': 181, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'rus_Cyrl-rus_Cyrl': {'number_of_characters': 75823, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 153.38, 'max_document_length': 196, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'rus_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'eng_Latn-rus_Cyrl': {'number_of_characters': 75823, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 153.38, 'max_document_length': 196, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'shn_Mymr-shn_Mymr': {'number_of_characters': 69288, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 139.98, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'shn_Mymr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-shn_Mymr': {'number_of_characters': 69288, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 139.98, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Latn-sin_Latn': {'number_of_characters': 85996, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 19, 'average_document_length': 174.22, 'max_document_length': 224, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sin_Latn': {'number_of_characters': 85996, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 174.22, 'max_document_length': 224, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Sinh-sin_Sinh': {'number_of_characters': 63902, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 128.95, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Sinh-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sin_Sinh': {'number_of_characters': 63902, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 128.95, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'slk_Latn-slk_Latn': {'number_of_characters': 62663, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 126.41, 'max_document_length': 146, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'slk_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-slk_Latn': {'number_of_characters': 62663, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 126.41, 'max_document_length': 146, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'slv_Latn-slv_Latn': {'number_of_characters': 62895, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 126.88, 'max_document_length': 176, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'slv_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-slv_Latn': {'number_of_characters': 62895, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 126.88, 'max_document_length': 176, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sna_Latn-sna_Latn': {'number_of_characters': 74071, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 149.78, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'sna_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sna_Latn': {'number_of_characters': 74071, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 149.78, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'snd_Arab-snd_Arab': {'number_of_characters': 58057, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 116.97, 'max_document_length': 164, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'snd_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-snd_Arab': {'number_of_characters': 58057, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 13, 'average_document_length': 116.97, 'max_document_length': 164, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'som_Latn-som_Latn': {'number_of_characters': 82838, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 167.75, 'max_document_length': 201, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'som_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-som_Latn': {'number_of_characters': 82838, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 167.75, 'max_document_length': 201, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sot_Latn-sot_Latn': {'number_of_characters': 75794, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 153.32, 'max_document_length': 186, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sot_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sot_Latn': {'number_of_characters': 75794, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 153.32, 'max_document_length': 186, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'spa_Latn-spa_Latn': {'number_of_characters': 74920, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 151.52, 'max_document_length': 180, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'spa_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-spa_Latn': {'number_of_characters': 74920, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 151.52, 'max_document_length': 180, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'srp_Cyrl-srp_Cyrl': {'number_of_characters': 61657, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 124.35, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'srp_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-srp_Cyrl': {'number_of_characters': 61657, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 124.35, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 
'unique_relevant_docs': 488}, 'ssw_Latn-ssw_Latn': {'number_of_characters': 73964, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 149.57, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ssw_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ssw_Latn': {'number_of_characters': 73964, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 149.57, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sun_Latn-sun_Latn': {'number_of_characters': 71320, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 144.15, 'max_document_length': 173, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sun_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sun_Latn': {'number_of_characters': 71320, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 144.15, 'max_document_length': 173, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'swe_Latn-swe_Latn': {'number_of_characters': 62785, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 126.66, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'swe_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-swe_Latn': {'number_of_characters': 62785, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 126.66, 'max_document_length': 154, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'swh_Latn-swh_Latn': {'number_of_characters': 73480, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 148.57, 'max_document_length': 194, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'swh_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-swh_Latn': {'number_of_characters': 73480, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 148.57, 'max_document_length': 194, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tam_Taml-tam_Taml': {'number_of_characters': 73991, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 149.62, 'max_document_length': 181, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tam_Taml-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tam_Taml': {'number_of_characters': 73991, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 149.62, 'max_document_length': 181, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tel_Telu-tel_Telu': {'number_of_characters': 65945, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 133.13, 'max_document_length': 149, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tel_Telu-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'eng_Latn-tel_Telu': {'number_of_characters': 65945, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 133.13, 'max_document_length': 149, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tgk_Cyrl-tgk_Cyrl': {'number_of_characters': 67829, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 136.99, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tgk_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tgk_Cyrl': {'number_of_characters': 67829, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 136.99, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tgl_Latn-tgl_Latn': {'number_of_characters': 75087, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 14, 'average_document_length': 151.87, 'max_document_length': 184, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tgl_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tgl_Latn': {'number_of_characters': 75087, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 151.87, 'max_document_length': 184, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tha_Thai-tha_Thai': {'number_of_characters': 54496, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 109.67, 'max_document_length': 123, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tha_Thai-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tha_Thai': {'number_of_characters': 54496, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 109.67, 'max_document_length': 123, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tir_Ethi-tir_Ethi': {'number_of_characters': 47775, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 95.9, 'max_document_length': 110, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tir_Ethi-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tir_Ethi': {'number_of_characters': 47775, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 95.9, 'max_document_length': 110, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tsn_Latn-tsn_Latn': {'number_of_characters': 79391, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 160.69, 'max_document_length': 204, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tsn_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tsn_Latn': {'number_of_characters': 79391, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 160.69, 'max_document_length': 204, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tso_Latn-tso_Latn': {'number_of_characters': 83501, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 169.11, 'max_document_length': 215, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'tso_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tso_Latn': {'number_of_characters': 83501, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 169.11, 'max_document_length': 215, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tur_Latn-tur_Latn': {'number_of_characters': 65382, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 131.98, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tur_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tur_Latn': {'number_of_characters': 65382, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 12, 'average_document_length': 131.98, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ukr_Cyrl-ukr_Cyrl': {'number_of_characters': 65850, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 132.94, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ukr_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ukr_Cyrl': {'number_of_characters': 65850, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 132.94, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Arab-urd_Arab': {'number_of_characters': 64450, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 130.07, 'max_document_length': 187, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-urd_Arab': {'number_of_characters': 64450, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 130.07, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Latn-urd_Latn': {'number_of_characters': 82039, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 166.11, 'max_document_length': 230, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-urd_Latn': {'number_of_characters': 82039, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 166.11, 'max_document_length': 230, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'uzn_Latn-uzn_Latn': {'number_of_characters': 70828, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 143.14, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'uzn_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-uzn_Latn': {'number_of_characters': 70828, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 143.14, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'vie_Latn-vie_Latn': {'number_of_characters': 66724, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 134.73, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'vie_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-vie_Latn': {'number_of_characters': 66724, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 134.73, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'war_Latn-war_Latn': {'number_of_characters': 78444, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 158.75, 'max_document_length': 207, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'war_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-war_Latn': {'number_of_characters': 78444, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 158.75, 'max_document_length': 207, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'wol_Latn-wol_Latn': {'number_of_characters': 64521, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 130.22, 'max_document_length': 139, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'wol_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-wol_Latn': {'number_of_characters': 64521, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 130.22, 'max_document_length': 139, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'xho_Latn-xho_Latn': {'number_of_characters': 71629, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 144.78, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'xho_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-xho_Latn': {'number_of_characters': 71629, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 144.78, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'yor_Latn-yor_Latn': {'number_of_characters': 62752, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 126.59, 'max_document_length': 143, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'yor_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-yor_Latn': {'number_of_characters': 62752, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 126.59, 'max_document_length': 143, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zho_Hans-zho_Hans': {'number_of_characters': 20549, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 7, 'average_document_length': 40.11, 'max_document_length': 64, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zho_Hans-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'eng_Latn-zho_Hans': {'number_of_characters': 20549, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 7, 'average_document_length': 40.11, 'max_document_length': 64, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zho_Hant-zho_Hant': {'number_of_characters': 19947, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 7, 'average_document_length': 38.88, 'max_document_length': 45, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zho_Hant-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-zho_Hant': {'number_of_characters': 19947, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 7, 'average_document_length': 38.88, 'max_document_length': 45, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zsm_Latn-zsm_Latn': {'number_of_characters': 72008, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 13, 'average_document_length': 145.56, 'max_document_length': 210, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zsm_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-zsm_Latn': {'number_of_characters': 72008, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 145.56, 'max_document_length': 210, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zul_Latn-zul_Latn': {'number_of_characters': 69413, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 140.24, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zul_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 
'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-zul_Latn': {'number_of_characters': 69413, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 140.24, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Arab-arb_Latn': {'number_of_characters': 61298, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 123.61, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Latn-arb_Arab': {'number_of_characters': 53671, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 107.98, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Beng-ben_Latn': {'number_of_characters': 68285, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 137.93, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 
'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Latn-ben_Beng': {'number_of_characters': 63512, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 128.15, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Deva-hin_Latn': {'number_of_characters': 68307, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.97, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Latn-hin_Deva': {'number_of_characters': 66332, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.93, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Deva-npi_Latn': {'number_of_characters': 65683, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 132.6, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 488}, 'npi_Latn-npi_Deva': {'number_of_characters': 61183, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 123.38, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Sinh-sin_Latn': {'number_of_characters': 85996, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 174.22, 'max_document_length': 224, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Latn-sin_Sinh': {'number_of_characters': 63902, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 128.95, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Arab-urd_Latn': {'number_of_characters': 82039, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 166.11, 'max_document_length': 230, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Latn-urd_Arab': {'number_of_characters': 64450, 'num_samples': 1388, 'num_queries': 900, 
'num_documents': 488, 'min_document_length': 11, 'average_document_length': 130.07, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}}}} | +| [BelebeleRetrieval](https://arxiv.org/abs/2308.16884) (Lucas Bandarkar, 2023) | ['acm', 'afr', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'azj', 'bam', 'ben', 'bod', 'bul', 'cat', 'ceb', 'ces', 'ckb', 'dan', 'deu', 'ell', 'eng', 'est', 'eus', 'fin', 'fra', 'fuv', 'gaz', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kac', 'kan', 'kat', 'kaz', 'kea', 'khk', 'khm', 'kin', 'kir', 'kor', 'lao', 'lin', 'lit', 'lug', 'luo', 'lvs', 'mal', 'mar', 'mkd', 'mlt', 'mri', 'mya', 'nld', 'nob', 'npi', 'nso', 'nya', 'ory', 'pan', 'pbt', 'pes', 'plt', 'pol', 'por', 'ron', 'rus', 'shn', 'sin', 'slk', 'slv', 'sna', 'snd', 'som', 'sot', 'spa', 'srp', 'ssw', 'sun', 'swe', 'swh', 'tam', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tsn', 'tso', 'tur', 'ukr', 'urd', 'uzn', 'vie', 'war', 'wol', 'xho', 'yor', 'zho', 'zsm', 'zul'] | Retrieval | s2p | [News, Web, Written] | {'test': 521866} | {'test': {'number_of_characters': 25574620, 'num_samples': 521866, 'num_queries': 338378, 'num_documents': 183488, 'min_document_length': 4, 'average_document_length': 137.38, 'max_document_length': 237, 'unique_documents': 183488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 338378, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 183488, 'hf_subset_descriptive_stats': {'acm_Arab-acm_Arab': {'number_of_characters': 51232, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 
'average_document_length': 102.98, 'max_document_length': 129, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'acm_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-acm_Arab': {'number_of_characters': 51232, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 102.98, 'max_document_length': 129, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'afr_Latn-afr_Latn': {'number_of_characters': 71217, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 143.94, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'afr_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-afr_Latn': {'number_of_characters': 71217, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 143.94, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'als_Latn-als_Latn': {'number_of_characters': 69498, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 140.41, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'als_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-als_Latn': {'number_of_characters': 69498, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 140.41, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'amh_Ethi-amh_Ethi': {'number_of_characters': 45221, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 90.67, 'max_document_length': 100, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'amh_Ethi-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-amh_Ethi': {'number_of_characters': 45221, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 90.67, 'max_document_length': 100, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'apc_Arab-apc_Arab': {'number_of_characters': 51248, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 103.02, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'apc_Arab-eng_Latn': 
{'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-apc_Arab': {'number_of_characters': 51248, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 103.02, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Arab-arb_Arab': {'number_of_characters': 53671, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 107.98, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-arb_Arab': {'number_of_characters': 53671, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 
'average_document_length': 107.98, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Latn-arb_Latn': {'number_of_characters': 61298, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 123.61, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-arb_Latn': {'number_of_characters': 61298, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 123.61, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ars_Arab-ars_Arab': {'number_of_characters': 51765, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 104.08, 'max_document_length': 119, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ars_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ars_Arab': {'number_of_characters': 51765, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 104.08, 'max_document_length': 119, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ary_Arab-ary_Arab': {'number_of_characters': 60261, 'num_samples': 1386, 'num_queries': 898, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 121.49, 'max_document_length': 138, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 898, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ary_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ary_Arab': {'number_of_characters': 60261, 'num_samples': 1386, 'num_queries': 898, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 121.49, 'max_document_length': 138, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 898, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arz_Arab-arz_Arab': {'number_of_characters': 52403, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 105.38, 'max_document_length': 115, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arz_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-arz_Arab': {'number_of_characters': 52403, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 105.38, 'max_document_length': 115, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'asm_Beng-asm_Beng': 
{'number_of_characters': 62410, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 4, 'average_document_length': 125.89, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'asm_Beng-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-asm_Beng': {'number_of_characters': 62410, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 4, 'average_document_length': 125.89, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'azj_Latn-azj_Latn': {'number_of_characters': 67137, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 135.58, 'max_document_length': 156, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'azj_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 
'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-azj_Latn': {'number_of_characters': 67137, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 135.58, 'max_document_length': 156, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bam_Latn-bam_Latn': {'number_of_characters': 66084, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 133.42, 'max_document_length': 166, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bam_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-bam_Latn': {'number_of_characters': 66084, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 133.42, 'max_document_length': 166, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Beng-ben_Beng': {'number_of_characters': 63512, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 128.15, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Beng-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ben_Beng': {'number_of_characters': 63512, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 128.15, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Latn-ben_Latn': {'number_of_characters': 68285, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 137.93, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ben_Latn': {'number_of_characters': 68285, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 137.93, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bod_Tibt-bod_Tibt': {'number_of_characters': 79188, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 160.27, 'max_document_length': 213, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bod_Tibt-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-bod_Tibt': 
{'number_of_characters': 79188, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 160.27, 'max_document_length': 213, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bul_Cyrl-bul_Cyrl': {'number_of_characters': 66577, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 134.43, 'max_document_length': 177, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'bul_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-bul_Cyrl': {'number_of_characters': 66577, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 134.43, 'max_document_length': 177, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'cat_Latn-cat_Latn': {'number_of_characters': 68842, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 
'average_document_length': 139.07, 'max_document_length': 163, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'cat_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-cat_Latn': {'number_of_characters': 68842, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 139.07, 'max_document_length': 163, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ceb_Latn-ceb_Latn': {'number_of_characters': 74053, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 149.75, 'max_document_length': 184, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ceb_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ceb_Latn': {'number_of_characters': 74053, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 149.75, 'max_document_length': 184, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ces_Latn-ces_Latn': {'number_of_characters': 61936, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 124.92, 'max_document_length': 139, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ces_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ces_Latn': {'number_of_characters': 61936, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 124.92, 'max_document_length': 139, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ckb_Arab-ckb_Arab': {'number_of_characters': 64917, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 131.03, 'max_document_length': 178, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ckb_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ckb_Arab': {'number_of_characters': 64917, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 131.03, 'max_document_length': 178, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'dan_Latn-dan_Latn': {'number_of_characters': 66648, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 134.57, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'dan_Latn-eng_Latn': 
{'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-dan_Latn': {'number_of_characters': 66648, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 134.57, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'deu_Latn-deu_Latn': {'number_of_characters': 68768, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 138.92, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'deu_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-deu_Latn': {'number_of_characters': 68768, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 
'average_document_length': 138.92, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ell_Grek-ell_Grek': {'number_of_characters': 79210, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 160.32, 'max_document_length': 212, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ell_Grek-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ell_Grek': {'number_of_characters': 79210, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 160.32, 'max_document_length': 212, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'est_Latn-est_Latn': {'number_of_characters': 61779, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 124.6, 'max_document_length': 164, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'est_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-est_Latn': {'number_of_characters': 61779, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 124.6, 'max_document_length': 164, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eus_Latn-eus_Latn': {'number_of_characters': 67979, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 137.3, 'max_document_length': 169, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eus_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-eus_Latn': {'number_of_characters': 67979, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 137.3, 'max_document_length': 169, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fin_Latn-fin_Latn': {'number_of_characters': 66234, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.73, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-fin_Latn': 
{'number_of_characters': 66234, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.73, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fra_Latn-fra_Latn': {'number_of_characters': 82464, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 166.98, 'max_document_length': 204, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fra_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-fra_Latn': {'number_of_characters': 82464, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 166.98, 'max_document_length': 204, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fuv_Latn-fuv_Latn': {'number_of_characters': 53555, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 
'average_document_length': 107.74, 'max_document_length': 122, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'fuv_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-fuv_Latn': {'number_of_characters': 53555, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 107.74, 'max_document_length': 122, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'gaz_Latn-gaz_Latn': {'number_of_characters': 78315, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 158.48, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'gaz_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-gaz_Latn': {'number_of_characters': 78315, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 158.48, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'grn_Latn-grn_Latn': {'number_of_characters': 68572, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 138.52, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'grn_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-grn_Latn': {'number_of_characters': 68572, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 138.52, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'guj_Gujr-guj_Gujr': {'number_of_characters': 57007, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 114.82, 'max_document_length': 138, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'guj_Gujr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-guj_Gujr': {'number_of_characters': 57007, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 114.82, 'max_document_length': 138, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hat_Latn-hat_Latn': {'number_of_characters': 64558, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 130.29, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hat_Latn-eng_Latn': 
{'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hat_Latn': {'number_of_characters': 64558, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 130.29, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hau_Latn-hau_Latn': {'number_of_characters': 78240, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 158.33, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hau_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hau_Latn': {'number_of_characters': 78240, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 
'average_document_length': 158.33, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'heb_Hebr-heb_Hebr': {'number_of_characters': 50598, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 101.68, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'heb_Hebr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-heb_Hebr': {'number_of_characters': 50598, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 101.68, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Deva-hin_Deva': {'number_of_characters': 66332, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.93, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Deva-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hin_Deva': {'number_of_characters': 66332, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.93, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Latn-hin_Latn': {'number_of_characters': 68307, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.97, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hin_Latn': {'number_of_characters': 68307, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.97, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hrv_Latn-hrv_Latn': {'number_of_characters': 62928, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 126.95, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hrv_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hrv_Latn': {'number_of_characters': 62928, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 126.95, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hun_Latn-hun_Latn': 
{'number_of_characters': 67941, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 137.22, 'max_document_length': 176, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hun_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hun_Latn': {'number_of_characters': 67941, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 137.22, 'max_document_length': 176, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hye_Armn-hye_Armn': {'number_of_characters': 68859, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 139.1, 'max_document_length': 193, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hye_Armn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 
'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-hye_Armn': {'number_of_characters': 68859, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 139.1, 'max_document_length': 193, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ibo_Latn-ibo_Latn': {'number_of_characters': 66167, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 133.59, 'max_document_length': 156, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'ibo_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ibo_Latn': {'number_of_characters': 66167, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 133.59, 'max_document_length': 156, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'ilo_Latn-ilo_Latn': {'number_of_characters': 78161, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 158.17, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ilo_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ilo_Latn': {'number_of_characters': 78161, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 158.17, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ind_Latn-ind_Latn': {'number_of_characters': 74871, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 151.42, 'max_document_length': 207, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ind_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ind_Latn': {'number_of_characters': 74871, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 151.42, 'max_document_length': 207, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'isl_Latn-isl_Latn': {'number_of_characters': 70522, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 142.51, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'isl_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-isl_Latn': 
{'number_of_characters': 70522, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 142.51, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ita_Latn-ita_Latn': {'number_of_characters': 76124, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 153.99, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ita_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ita_Latn': {'number_of_characters': 76124, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 153.99, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'jav_Latn-jav_Latn': {'number_of_characters': 71722, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 
'average_document_length': 144.97, 'max_document_length': 174, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'jav_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-jav_Latn': {'number_of_characters': 71722, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 144.97, 'max_document_length': 174, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'jpn_Jpan-jpn_Jpan': {'number_of_characters': 33187, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 66.01, 'max_document_length': 76, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'jpn_Jpan-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-jpn_Jpan': {'number_of_characters': 33187, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 66.01, 'max_document_length': 76, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kac_Latn-kac_Latn': {'number_of_characters': 89655, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 181.72, 'max_document_length': 195, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kac_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kac_Latn': {'number_of_characters': 89655, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 181.72, 'max_document_length': 195, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kan_Knda-kan_Knda': {'number_of_characters': 65899, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.04, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kan_Knda-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kan_Knda': {'number_of_characters': 65899, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.04, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kat_Geor-kat_Geor': {'number_of_characters': 68309, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.98, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kat_Geor-eng_Latn': 
{'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kat_Geor': {'number_of_characters': 68309, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.98, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kaz_Cyrl-kaz_Cyrl': {'number_of_characters': 64657, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 130.49, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kaz_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kaz_Cyrl': {'number_of_characters': 64657, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 
'average_document_length': 130.49, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kea_Latn-kea_Latn': {'number_of_characters': 69323, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 140.06, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kea_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kea_Latn': {'number_of_characters': 69323, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 140.06, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'khk_Cyrl-khk_Cyrl': {'number_of_characters': 66977, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 135.25, 'max_document_length': 162, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'khk_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-khk_Cyrl': {'number_of_characters': 66977, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 135.25, 'max_document_length': 162, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'khm_Khmr-khm_Khmr': {'number_of_characters': 69150, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 139.7, 'max_document_length': 169, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'khm_Khmr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-khm_Khmr': {'number_of_characters': 69150, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 139.7, 'max_document_length': 169, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kin_Latn-kin_Latn': {'number_of_characters': 72803, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 147.19, 'max_document_length': 194, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'kin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kin_Latn': {'number_of_characters': 72803, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 147.19, 'max_document_length': 194, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'kir_Cyrl-kir_Cyrl': 
{'number_of_characters': 67957, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 137.26, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kir_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kir_Cyrl': {'number_of_characters': 67957, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 137.26, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kor_Hang-kor_Hang': {'number_of_characters': 32708, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 65.02, 'max_document_length': 88, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'kor_Hang-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 
'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-kor_Hang': {'number_of_characters': 32708, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 65.02, 'max_document_length': 88, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lao_Laoo-lao_Laoo': {'number_of_characters': 57958, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 116.77, 'max_document_length': 142, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lao_Laoo-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lao_Laoo': {'number_of_characters': 57958, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 116.77, 'max_document_length': 142, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lin_Latn-lin_Latn': {'number_of_characters': 74223, 'num_samples': 1386, 'num_queries': 898, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 150.1, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 898, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'lin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lin_Latn': {'number_of_characters': 74223, 'num_samples': 1386, 'num_queries': 898, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 150.1, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 898, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'lit_Latn-lit_Latn': {'number_of_characters': 62805, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 126.7, 'max_document_length': 167, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lit_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lit_Latn': {'number_of_characters': 62805, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 126.7, 'max_document_length': 167, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lug_Latn-lug_Latn': {'number_of_characters': 71566, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 144.65, 'max_document_length': 237, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'lug_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lug_Latn': 
{'number_of_characters': 71566, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 144.65, 'max_document_length': 237, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'luo_Latn-luo_Latn': {'number_of_characters': 66805, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 134.9, 'max_document_length': 178, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'luo_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-luo_Latn': {'number_of_characters': 66805, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 134.9, 'max_document_length': 178, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lvs_Latn-lvs_Latn': {'number_of_characters': 63957, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 
'average_document_length': 129.06, 'max_document_length': 172, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'lvs_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-lvs_Latn': {'number_of_characters': 63957, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 129.06, 'max_document_length': 172, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mal_Mlym-mal_Mlym': {'number_of_characters': 73599, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 148.82, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mal_Mlym-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mal_Mlym': {'number_of_characters': 73599, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 148.82, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mar_Deva-mar_Deva': {'number_of_characters': 62671, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 126.42, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'mar_Deva-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mar_Deva': {'number_of_characters': 62671, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 126.42, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'mkd_Cyrl-mkd_Cyrl': {'number_of_characters': 67588, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 136.5, 'max_document_length': 180, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mkd_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mkd_Cyrl': {'number_of_characters': 67588, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 136.5, 'max_document_length': 180, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mlt_Latn-mlt_Latn': {'number_of_characters': 68480, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 138.33, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mlt_Latn-eng_Latn': 
{'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mlt_Latn': {'number_of_characters': 68480, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 138.33, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mri_Latn-mri_Latn': {'number_of_characters': 74519, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 150.7, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mri_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mri_Latn': {'number_of_characters': 74519, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 
'average_document_length': 150.7, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mya_Mymr-mya_Mymr': {'number_of_characters': 81331, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 164.66, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'mya_Mymr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-mya_Mymr': {'number_of_characters': 81331, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 164.66, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nld_Latn-nld_Latn': {'number_of_characters': 68789, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 138.96, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nld_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-nld_Latn': {'number_of_characters': 68789, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 138.96, 'max_document_length': 183, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nob_Latn-nob_Latn': {'number_of_characters': 64917, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 131.03, 'max_document_length': 168, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nob_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-nob_Latn': {'number_of_characters': 64917, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 131.03, 'max_document_length': 168, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Deva-npi_Deva': {'number_of_characters': 61183, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 123.38, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Deva-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-npi_Deva': {'number_of_characters': 61183, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 123.38, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Latn-npi_Latn': 
{'number_of_characters': 65683, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 132.6, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-npi_Latn': {'number_of_characters': 65683, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 132.6, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nso_Latn-nso_Latn': {'number_of_characters': 79073, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 160.03, 'max_document_length': 235, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nso_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 
'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-nso_Latn': {'number_of_characters': 79073, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 160.03, 'max_document_length': 235, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nya_Latn-nya_Latn': {'number_of_characters': 82685, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 167.44, 'max_document_length': 215, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'nya_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-nya_Latn': {'number_of_characters': 82685, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 167.44, 'max_document_length': 215, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ory_Orya-ory_Orya': {'number_of_characters': 66638, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 10, 'average_document_length': 134.55, 'max_document_length': 168, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ory_Orya-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ory_Orya': {'number_of_characters': 66638, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 10, 'average_document_length': 134.55, 'max_document_length': 168, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pan_Guru-pan_Guru': {'number_of_characters': 66944, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 135.18, 'max_document_length': 157, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pan_Guru-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-pan_Guru': {'number_of_characters': 66944, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 135.18, 'max_document_length': 157, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pbt_Arab-pbt_Arab': {'number_of_characters': 61880, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 124.8, 'max_document_length': 155, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pbt_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-pbt_Arab': 
{'number_of_characters': 61880, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 124.8, 'max_document_length': 155, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pes_Arab-pes_Arab': {'number_of_characters': 59252, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 119.42, 'max_document_length': 152, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pes_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-pes_Arab': {'number_of_characters': 59252, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 119.42, 'max_document_length': 152, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'plt_Latn-plt_Latn': {'number_of_characters': 86472, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 
'average_document_length': 175.2, 'max_document_length': 222, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'plt_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-plt_Latn': {'number_of_characters': 86472, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 175.2, 'max_document_length': 222, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pol_Latn-pol_Latn': {'number_of_characters': 67664, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 136.66, 'max_document_length': 196, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'pol_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-pol_Latn': {'number_of_characters': 67664, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 136.66, 'max_document_length': 196, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'por_Latn-por_Latn': {'number_of_characters': 71281, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 144.07, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'por_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-por_Latn': {'number_of_characters': 71281, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 144.07, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ron_Latn-ron_Latn': {'number_of_characters': 71844, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 145.22, 'max_document_length': 181, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ron_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ron_Latn': {'number_of_characters': 71844, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 145.22, 'max_document_length': 181, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'rus_Cyrl-rus_Cyrl': {'number_of_characters': 75823, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 153.38, 'max_document_length': 196, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'rus_Cyrl-eng_Latn': 
{'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-rus_Cyrl': {'number_of_characters': 75823, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 153.38, 'max_document_length': 196, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'shn_Mymr-shn_Mymr': {'number_of_characters': 69288, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 139.98, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'shn_Mymr-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-shn_Mymr': {'number_of_characters': 69288, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 
'average_document_length': 139.98, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Latn-sin_Latn': {'number_of_characters': 85996, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 174.22, 'max_document_length': 224, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sin_Latn': {'number_of_characters': 85996, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 174.22, 'max_document_length': 224, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Sinh-sin_Sinh': {'number_of_characters': 63902, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 128.95, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Sinh-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sin_Sinh': {'number_of_characters': 63902, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 128.95, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'slk_Latn-slk_Latn': {'number_of_characters': 62663, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 126.41, 'max_document_length': 146, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'slk_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-slk_Latn': {'number_of_characters': 62663, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 126.41, 'max_document_length': 146, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'slv_Latn-slv_Latn': {'number_of_characters': 62895, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 126.88, 'max_document_length': 176, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'slv_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-slv_Latn': {'number_of_characters': 62895, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 126.88, 'max_document_length': 176, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sna_Latn-sna_Latn': 
{'number_of_characters': 74071, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 149.78, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sna_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sna_Latn': {'number_of_characters': 74071, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 149.78, 'max_document_length': 191, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'snd_Arab-snd_Arab': {'number_of_characters': 58057, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 116.97, 'max_document_length': 164, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'snd_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 
'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-snd_Arab': {'number_of_characters': 58057, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 116.97, 'max_document_length': 164, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'som_Latn-som_Latn': {'number_of_characters': 82838, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 167.75, 'max_document_length': 201, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'som_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-som_Latn': {'number_of_characters': 82838, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 167.75, 'max_document_length': 201, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sot_Latn-sot_Latn': {'number_of_characters': 75794, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 153.32, 'max_document_length': 186, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sot_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sot_Latn': {'number_of_characters': 75794, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 153.32, 'max_document_length': 186, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'spa_Latn-spa_Latn': {'number_of_characters': 74920, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 151.52, 'max_document_length': 180, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'spa_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-spa_Latn': {'number_of_characters': 74920, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 151.52, 'max_document_length': 180, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'srp_Cyrl-srp_Cyrl': {'number_of_characters': 61657, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 124.35, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'srp_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-srp_Cyrl': 
{'number_of_characters': 61657, 'num_samples': 1387, 'num_queries': 899, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 124.35, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.09, 'max_query_length': 2, 'unique_queries': 899, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 2, 'unique_relevant_docs': 488}, 'ssw_Latn-ssw_Latn': {'number_of_characters': 73964, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 149.57, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ssw_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ssw_Latn': {'number_of_characters': 73964, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 149.57, 'max_document_length': 182, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sun_Latn-sun_Latn': {'number_of_characters': 71320, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 
'average_document_length': 144.15, 'max_document_length': 173, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sun_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-sun_Latn': {'number_of_characters': 71320, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 144.15, 'max_document_length': 173, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'swe_Latn-swe_Latn': {'number_of_characters': 62785, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 126.66, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'swe_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-swe_Latn': {'number_of_characters': 62785, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 126.66, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'swh_Latn-swh_Latn': {'number_of_characters': 73480, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 148.57, 'max_document_length': 194, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'swh_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-swh_Latn': {'number_of_characters': 73480, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 148.57, 'max_document_length': 194, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tam_Taml-tam_Taml': {'number_of_characters': 73991, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 149.62, 'max_document_length': 181, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tam_Taml-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tam_Taml': {'number_of_characters': 73991, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 149.62, 'max_document_length': 181, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tel_Telu-tel_Telu': {'number_of_characters': 65945, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 133.13, 'max_document_length': 149, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tel_Telu-eng_Latn': 
{'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tel_Telu': {'number_of_characters': 65945, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 133.13, 'max_document_length': 149, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tgk_Cyrl-tgk_Cyrl': {'number_of_characters': 67829, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 136.99, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tgk_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tgk_Cyrl': {'number_of_characters': 67829, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 
'average_document_length': 136.99, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tgl_Latn-tgl_Latn': {'number_of_characters': 75087, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 151.87, 'max_document_length': 184, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tgl_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tgl_Latn': {'number_of_characters': 75087, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 151.87, 'max_document_length': 184, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tha_Thai-tha_Thai': {'number_of_characters': 54496, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 109.67, 'max_document_length': 123, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tha_Thai-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tha_Thai': {'number_of_characters': 54496, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 109.67, 'max_document_length': 123, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tir_Ethi-tir_Ethi': {'number_of_characters': 47775, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 95.9, 'max_document_length': 110, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tir_Ethi-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tir_Ethi': {'number_of_characters': 47775, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 95.9, 'max_document_length': 110, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tsn_Latn-tsn_Latn': {'number_of_characters': 79391, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 160.69, 'max_document_length': 204, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tsn_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tsn_Latn': {'number_of_characters': 79391, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 160.69, 'max_document_length': 204, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tso_Latn-tso_Latn': 
{'number_of_characters': 83501, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 169.11, 'max_document_length': 215, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tso_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tso_Latn': {'number_of_characters': 83501, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 169.11, 'max_document_length': 215, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tur_Latn-tur_Latn': {'number_of_characters': 65382, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 131.98, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'tur_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 
'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-tur_Latn': {'number_of_characters': 65382, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 131.98, 'max_document_length': 158, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ukr_Cyrl-ukr_Cyrl': {'number_of_characters': 65850, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 132.94, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ukr_Cyrl-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-ukr_Cyrl': {'number_of_characters': 65850, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 132.94, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Arab-urd_Arab': {'number_of_characters': 64450, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 130.07, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Arab-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-urd_Arab': {'number_of_characters': 64450, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 130.07, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Latn-urd_Latn': {'number_of_characters': 82039, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 166.11, 'max_document_length': 230, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-urd_Latn': {'number_of_characters': 82039, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 166.11, 'max_document_length': 230, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'uzn_Latn-uzn_Latn': {'number_of_characters': 70828, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 143.14, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'uzn_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-uzn_Latn': 
{'number_of_characters': 70828, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 143.14, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'vie_Latn-vie_Latn': {'number_of_characters': 66724, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 134.73, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'vie_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-vie_Latn': {'number_of_characters': 66724, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 134.73, 'max_document_length': 161, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'war_Latn-war_Latn': {'number_of_characters': 78444, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 
'average_document_length': 158.75, 'max_document_length': 207, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'war_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-war_Latn': {'number_of_characters': 78444, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 158.75, 'max_document_length': 207, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'wol_Latn-wol_Latn': {'number_of_characters': 64521, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 130.22, 'max_document_length': 139, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'wol_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-wol_Latn': {'number_of_characters': 64521, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 'average_document_length': 130.22, 'max_document_length': 139, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'xho_Latn-xho_Latn': {'number_of_characters': 71629, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 144.78, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'xho_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-xho_Latn': {'number_of_characters': 71629, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 16, 'average_document_length': 144.78, 'max_document_length': 179, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'yor_Latn-yor_Latn': {'number_of_characters': 62752, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 126.59, 'max_document_length': 143, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'yor_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-yor_Latn': {'number_of_characters': 62752, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 126.59, 'max_document_length': 143, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zho_Hans-zho_Hans': {'number_of_characters': 20549, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 7, 'average_document_length': 40.11, 'max_document_length': 64, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zho_Hans-eng_Latn': 
{'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-zho_Hans': {'number_of_characters': 20549, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 7, 'average_document_length': 40.11, 'max_document_length': 64, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zho_Hant-zho_Hant': {'number_of_characters': 19947, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 7, 'average_document_length': 38.88, 'max_document_length': 45, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zho_Hant-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-zho_Hant': {'number_of_characters': 19947, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 7, 
'average_document_length': 38.88, 'max_document_length': 45, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zsm_Latn-zsm_Latn': {'number_of_characters': 72008, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 145.56, 'max_document_length': 210, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zsm_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-zsm_Latn': {'number_of_characters': 72008, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 13, 'average_document_length': 145.56, 'max_document_length': 210, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zul_Latn-zul_Latn': {'number_of_characters': 69413, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 140.24, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 
'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'zul_Latn-eng_Latn': {'number_of_characters': 70589, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 142.65, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'eng_Latn-zul_Latn': {'number_of_characters': 69413, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 140.24, 'max_document_length': 171, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Arab-arb_Latn': {'number_of_characters': 61298, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 12, 'average_document_length': 123.61, 'max_document_length': 160, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'arb_Latn-arb_Arab': {'number_of_characters': 53671, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 107.98, 'max_document_length': 134, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 
'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Beng-ben_Latn': {'number_of_characters': 68285, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 137.93, 'max_document_length': 185, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'ben_Latn-ben_Beng': {'number_of_characters': 63512, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 9, 'average_document_length': 128.15, 'max_document_length': 175, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Deva-hin_Latn': {'number_of_characters': 68307, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 137.97, 'max_document_length': 170, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'hin_Latn-hin_Deva': {'number_of_characters': 66332, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 14, 'average_document_length': 133.93, 'max_document_length': 165, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Deva-npi_Latn': 
{'number_of_characters': 65683, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 20, 'average_document_length': 132.6, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'npi_Latn-npi_Deva': {'number_of_characters': 61183, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 18, 'average_document_length': 123.38, 'max_document_length': 154, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Sinh-sin_Latn': {'number_of_characters': 85996, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 19, 'average_document_length': 174.22, 'max_document_length': 224, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'sin_Latn-sin_Sinh': {'number_of_characters': 63902, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 17, 'average_document_length': 128.95, 'max_document_length': 159, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Arab-urd_Latn': {'number_of_characters': 82039, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 15, 
'average_document_length': 166.11, 'max_document_length': 230, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}, 'urd_Latn-urd_Arab': {'number_of_characters': 64450, 'num_samples': 1388, 'num_queries': 900, 'num_documents': 488, 'min_document_length': 11, 'average_document_length': 130.07, 'max_document_length': 187, 'unique_documents': 488, 'min_query_length': 2, 'average_query_length': 1.08, 'max_query_length': 2, 'unique_queries': 900, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 488}}}} | | [BengaliDocumentClassification](https://aclanthology.org/2023.eacl-main.4) | ['ben'] | Classification | s2s | [News, Written] | None | None | | [BengaliHateSpeechClassification](https://huggingface.co/datasets/bn_hate_speech) (Karim et al., 2020) | ['ben'] | Classification | s2s | [News, Written] | None | None | | [BengaliSentimentAnalysis](https://data.mendeley.com/datasets/p6zc7krs37/4) (Sazzed et al., 2020) | ['ben'] | Classification | s2s | [Reviews, Written] | None | None | @@ -54,14 +54,14 @@ The following tables give you an overview of the tasks in MTEB. 
| [BiorxivClusteringS2S.v2](https://api.biorxiv.org/) | ['eng'] | Clustering | s2s | [Academic, Written] | None | None | | [BlurbsClusteringP2P.v2](https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html) (Steffen Remus, 2019) | ['deu'] | Clustering | p2p | [Fiction, Written] | None | None | | [BlurbsClusteringS2S.v2](https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html) (Steffen Remus, 2019) | ['deu'] | Clustering | s2s | [Fiction, Written] | None | None | -| [BornholmBitextMining](https://aclanthology.org/W19-6138/) | ['dan'] | BitextMining | s2s | [Web, Social, Fiction, Written] | {'test': 500} | {'test': {'num_samples': 500, 'number_of_characters': 44361, 'unique_pairs': 500, 'min_sentence1_length': 1, 'average_sentence1_length': 49.83, 'max_sentence1_length': 555, 'unique_sentence1': 497, 'min_sentence2_length': 5, 'average_sentence2_length': 38.89, 'max_sentence2_length': 453, 'unique_sentence2': 491}} | +| [BornholmBitextMining](https://aclanthology.org/W19-6138/) | ['dan'] | BitextMining | s2s | [Fiction, Social, Web, Written] | {'test': 500} | {'test': {'num_samples': 500, 'number_of_characters': 44361, 'unique_pairs': 500, 'min_sentence1_length': 1, 'average_sentence1_length': 49.83, 'max_sentence1_length': 555, 'unique_sentence1': 497, 'min_sentence2_length': 5, 'average_sentence2_length': 38.89, 'max_sentence2_length': 453, 'unique_sentence2': 491}} | | [BrazilianToxicTweetsClassification](https://paperswithcode.com/dataset/told-br) (Joao Augusto Leite and Diego F. 
Silva and Kalina Bontcheva and Carolina Scarton, 2020) | ['por'] | MultilabelClassification | s2s | [Constructed, Written] | None | None | | [BrightRetrieval](https://huggingface.co/datasets/xlangai/BRIGHT) (Hongjin Su, 2024) | ['eng'] | Retrieval | s2p | [Non-fiction] | None | None | | [BulgarianStoreReviewSentimentClassfication](https://doi.org/10.7910/DVN/TXIK9P) (Georgieva-Trifonova et al., 2018) | ['bul'] | Classification | s2s | [Reviews, Written] | None | None | -| [CBD](http://2019.poleval.pl/files/poleval2019.pdf) | ['pol'] | Classification | s2s | [Written, Social] | None | None | +| [CBD](http://2019.poleval.pl/files/poleval2019.pdf) | ['pol'] | Classification | s2s | [Social, Written] | None | None | | [CDSC-E](https://aclanthology.org/P17-1073.pdf) | ['pol'] | PairClassification | s2s | [Written] | None | None | | [CDSC-R](https://aclanthology.org/P17-1073.pdf) | ['pol'] | STS | s2s | [Web, Written] | None | None | -| [CEDRClassification](https://www.sciencedirect.com/science/article/pii/S1877050921013247) (Sboev et al., 2021) | ['rus'] | MultilabelClassification | s2s | [Web, Social, Blog, Written] | {'test': 1882, 'train': 7528} | {'test': {'num_samples': 1882, 'number_of_characters': 171649, 'number_texts_in_train': 7, 'min_text_length': 6, 'average_text_length': 91.21, 'max_text_length': 220, 'unique_texts': 1875, 'min_labels_per_text': 0, 'average_label_per_text': 0.62, 'max_labels_per_text': 2, 'unique_labels': 6, 'labels': {'None': {'count': 734}, '3': {'count': 141}, '2': {'count': 170}, '1': {'count': 379}, '0': {'count': 353}, '4': {'count': 125}}}, 'train': {'num_samples': 7528, 'number_of_characters': 697322, 'number_texts_in_train': None, 'min_text_length': 5, 'average_text_length': 92.63, 'max_text_length': 280, 'unique_texts': 7500, 'min_labels_per_text': 0, 'average_label_per_text': 0.61, 'max_labels_per_text': 3, 'unique_labels': 6, 'labels': {'None': {'count': 3043}, '2': {'count': 607}, '0': {'count': 1569}, '3': {'count': 589}, '1': 
{'count': 1417}, '4': {'count': 411}}}} | +| [CEDRClassification](https://www.sciencedirect.com/science/article/pii/S1877050921013247) (Sboev et al., 2021) | ['rus'] | MultilabelClassification | s2s | [Blog, Social, Web, Written] | {'test': 1882, 'train': 7528} | {'test': {'num_samples': 1882, 'number_of_characters': 171649, 'number_texts_in_train': 7, 'min_text_length': 6, 'average_text_length': 91.21, 'max_text_length': 220, 'unique_texts': 1875, 'min_labels_per_text': 0, 'average_label_per_text': 0.62, 'max_labels_per_text': 2, 'unique_labels': 6, 'labels': {'None': {'count': 734}, '3': {'count': 141}, '2': {'count': 170}, '1': {'count': 379}, '0': {'count': 353}, '4': {'count': 125}}}, 'train': {'num_samples': 7528, 'number_of_characters': 697322, 'number_texts_in_train': None, 'min_text_length': 5, 'average_text_length': 92.63, 'max_text_length': 280, 'unique_texts': 7500, 'min_labels_per_text': 0, 'average_label_per_text': 0.61, 'max_labels_per_text': 3, 'unique_labels': 6, 'labels': {'None': {'count': 3043}, '2': {'count': 607}, '0': {'count': 1569}, '3': {'count': 589}, '1': {'count': 1417}, '4': {'count': 411}}}} | | [CExaPPC](https://github.com/exaco/exappc) | ['fas'] | PairClassification | s2s | [Social, Web] | None | None | | [CLSClusteringP2P.v2](https://arxiv.org/abs/2209.05034) (Yudong Li, 2022) | ['cmn'] | Clustering | p2p | [Academic, Written] | None | None | | [CLSClusteringS2S.v2](https://arxiv.org/abs/2209.05034) (Yudong Li, 2022) | ['cmn'] | Clustering | s2s | [Academic, Written] | None | None | @@ -69,31 +69,31 @@ The following tables give you an overview of the tasks in MTEB. | [CMedQAv2-reranking](https://github.com/zhangsheng93/cMedQA2) (S. 
Zhang, 2018) | ['cmn'] | Reranking | s2s | [Medical, Written] | None | None | | [COIRCodeSearchNetRetrieval](https://huggingface.co/datasets/code_search_net/) (Husain et al., 2019) | ['go', 'java', 'javascript', 'php', 'python', 'ruby'] | Retrieval | p2p | [Programming, Written] | {'test': 1056326} | {'test': {'number_of_characters': 36843313, 'num_samples': 1056326, 'num_queries': 52561, 'num_documents': 1003765, 'min_document_length': 54, 'average_document_length': 34.71, 'max_document_length': 334374, 'unique_documents': 1003765, 'min_query_length': 2, 'average_query_length': 38.19, 'max_query_length': 2, 'unique_queries': 52561, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 52561, 'hf_subset_descriptive_stats': {'python': {'number_of_characters': 14574651, 'num_samples': 295228, 'num_queries': 14918, 'num_documents': 280310, 'min_document_length': 95, 'average_document_length': 49.99, 'max_document_length': 14008, 'unique_documents': 280310, 'min_query_length': 2, 'average_query_length': 37.58, 'max_query_length': 2, 'unique_queries': 14918, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 14918}, 'javascript': {'number_of_characters': 2587540, 'num_samples': 68145, 'num_queries': 3291, 'num_documents': 64854, 'min_document_length': 87, 'average_document_length': 37.9, 'max_document_length': 334374, 'unique_documents': 64854, 'min_query_length': 2, 'average_query_length': 39.41, 'max_query_length': 2, 'unique_queries': 3291, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3291}, 'go': {'number_of_characters': 3641108, 'num_samples': 190562, 'num_queries': 8122, 'num_documents': 182440, 'min_document_length': 54, 'average_document_length': 17.96, 'max_document_length': 5280, 'unique_documents': 182440, 
'min_query_length': 2, 'average_query_length': 44.92, 'max_query_length': 2, 'unique_queries': 8122, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 8122}, 'ruby': {'number_of_characters': 629446, 'num_samples': 28831, 'num_queries': 1261, 'num_documents': 27570, 'min_document_length': 83, 'average_document_length': 20.83, 'max_document_length': 3992, 'unique_documents': 27570, 'min_query_length': 2, 'average_query_length': 43.73, 'max_query_length': 2, 'unique_queries': 1261, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1261}, 'java': {'number_of_characters': 6791137, 'num_samples': 191821, 'num_queries': 10955, 'num_documents': 180866, 'min_document_length': 77, 'average_document_length': 35.55, 'max_document_length': 7615, 'unique_documents': 180866, 'min_query_length': 2, 'average_query_length': 33.02, 'max_query_length': 2, 'unique_queries': 10955, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10955}, 'php': {'number_of_characters': 8619431, 'num_samples': 281739, 'num_queries': 14014, 'num_documents': 267725, 'min_document_length': 94, 'average_document_length': 30.2, 'max_document_length': 4904, 'unique_documents': 267725, 'min_query_length': 2, 'average_query_length': 38.21, 'max_query_length': 2, 'unique_queries': 14014, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 14014}}}} | | [CPUSpeedTask](https://github.com/KennethEnevoldsen/scandinavian-embedding-benchmark/blob/c8376f967d1294419be1d3eb41217d04cd3a65d3/src/seb/registered_tasks/speed.py#L83-L96) | ['eng'] | Speed | s2s | [Fiction, Written] | None | None | -| [CQADupstackAndroidRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) 
| ['eng'] | Retrieval | s2p | [Programming, Web, Written, Non-fiction] | None | None | +| [CQADupstackAndroidRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Non-fiction, Programming, Web, Written] | None | None | | [CQADupstackAndroidRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-android-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackEnglishRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written] | None | None | | [CQADupstackEnglishRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-english-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CQADupstackGamingRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Web, Written] | None | None | | [CQADupstackGamingRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-gaming-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackGisRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Non-fiction] | None | None | +| [CQADupstackGisRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Non-fiction, Written] | None | None | | [CQADupstackGisRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-gis-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | +| [CQADupstackMathematicaRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | None | | 
[CQADupstackMathematicaRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-mathematica-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | +| [CQADupstackPhysicsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | None | | [CQADupstackPhysicsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Written, Non-fiction] | None | None | +| [CQADupstackProgrammersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Non-fiction, Programming, Written] | None | None | | [CQADupstackProgrammersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Web, Non-fiction, Academic, Programming, Written] | None | None | +| [CQADupstackRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | None | [Academic, Non-fiction, Programming, Web, Written] | None | None | | CQADupstackRetrieval-Fa | ['fas'] | Retrieval | None | [Web] | None | None | -| [CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Academic, Non-fiction] | None | None | +| 
[CQADupstackStatsRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | None | | [CQADupstackStatsRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Non-fiction] | None | None | +| [CQADupstackTexRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Non-fiction, Written] | None | None | | [CQADupstackTexRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-tex-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Web, Programming] | None | None | +| [CQADupstackUnixRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Web, Written] | None | None | | [CQADupstackUnixRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-unix-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackWebmastersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Written, Web] | None | None | +| [CQADupstackWebmastersRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Web, Written] | None | None | | [CQADupstackWebmastersRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-webmasters-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [CQADupstackWordpressRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | 
[Written, Web, Programming] | None | None | +| [CQADupstackWordpressRetrieval](http://nlp.cis.unimelb.edu.au/resources/cqadupstack/) (Hoogeveen et al., 2015) | ['eng'] | Retrieval | s2p | [Programming, Web, Written] | None | None | | [CQADupstackWordpressRetrieval-Fa](https://huggingface.co/datasets/MCINext/cqadupstack-wordpress-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [CSFDCZMovieReviewSentimentClassification](https://arxiv.org/abs/2304.01922) (Michal Štefánik, 2023) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | | [CSFDSKMovieReviewSentimentClassification](https://arxiv.org/abs/2304.01922) (Michal Štefánik, 2023) | ['slk'] | Classification | s2s | [Reviews, Written] | None | None | @@ -136,9 +136,9 @@ The following tables give you an overview of the tasks in MTEB. | [CUADUnlimitedAllYouCanEatLicenseLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [CUADVolumeRestrictionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [CUADWarrantyDurationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [CUREv1](https://huggingface.co/datasets/clinia/CUREv1) | ['eng', 'fra', 'spa'] | Retrieval | s2p | [Medical, Academic, Written] | None | None | +| [CUREv1](https://huggingface.co/datasets/clinia/CUREv1) | ['eng', 'fra', 'spa'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | | [CanadaTaxCourtOutcomesLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [CataloniaTweetClassification](https://aclanthology.org/2020.lrec-1.171/) | ['cat', 'spa'] | Classification | s2s | [Social, 
Government, Written] | None | None | +| [CataloniaTweetClassification](https://aclanthology.org/2020.lrec-1.171/) | ['cat', 'spa'] | Classification | s2s | [Government, Social, Written] | None | None | | [ChemHotpotQARetrieval](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Retrieval | s2p | [Chemistry] | None | None | | [ChemNQRetrieval](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Retrieval | s2p | [Chemistry] | None | None | | [ClimateFEVER](https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | @@ -177,11 +177,11 @@ The following tables give you an overview of the tasks in MTEB. | [CzechProductReviewSentimentClassification](https://aclanthology.org/W13-1609/) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | | [CzechSoMeSentimentClassification](https://aclanthology.org/W13-1609/) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | | [CzechSubjectivityClassification](https://arxiv.org/abs/2009.08712) | ['ces'] | Classification | s2s | [Reviews, Written] | None | None | -| [DBPedia](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['eng'] | Retrieval | s2p | [Written, Encyclopaedic] | None | None | +| [DBPedia](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [DBPedia-Fa](https://huggingface.co/datasets/MCINext/dbpedia-fa) | ['fas'] | Retrieval | s2p | [Encyclopaedic] | None | None | -| [DBPedia-PL](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['pol'] | Retrieval | s2p | [Written, Encyclopaedic] | None | None | -| [DBPedia-PLHardNegatives](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['pol'] | Retrieval | s2p | [Written, Encyclopaedic] | None | None | -| 
[DBPediaHardNegatives](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['eng'] | Retrieval | s2p | [Written, Encyclopaedic] | None | None | +| [DBPedia-PL](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['pol'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [DBPedia-PLHardNegatives](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['pol'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [DBPediaHardNegatives](https://github.com/iai-group/DBpedia-Entity/) (Hasibi et al., 2017) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [DBpediaClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Encyclopaedic, Written] | None | None | | [DKHateClassification](https://aclanthology.org/2020.lrec-1.430/) | ['dan'] | Classification | s2s | [Social, Written] | None | None | | [DalajClassification](https://spraakbanken.gu.se/en/resources/superlim) | ['swe'] | Classification | s2s | [Non-fiction, Written] | None | None | @@ -215,16 +215,16 @@ The following tables give you an overview of the tasks in MTEB. 
| [FarsTail](https://link.springer.com/article/10.1007/s00500-023-08959-3) (Amirkhani et al., 2023) | ['fas'] | PairClassification | s2s | [Academic, Written] | None | None | | [FarsiParaphraseDetection](https://huggingface.co/datasets/alighasemi/farsi_paraphrase_detection) | ['fas'] | PairClassification | s2s | | None | None | | [Farsick](https://github.com/ZahraGhasemi-AI/FarSick) | ['fas'] | STS | s2s | | None | None | -| [FeedbackQARetrieval](https://arxiv.org/abs/2204.03025) | ['eng'] | Retrieval | s2p | [Web, Government, Medical, Written] | None | None | -| [FiQA-PL](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['pol'] | Retrieval | s2p | [Written, Financial] | None | None | -| [FiQA2018](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['eng'] | Retrieval | s2p | [Written, Financial] | None | None | +| [FeedbackQARetrieval](https://arxiv.org/abs/2204.03025) | ['eng'] | Retrieval | s2p | [Government, Medical, Web, Written] | None | None | +| [FiQA-PL](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['pol'] | Retrieval | s2p | [Financial, Written] | None | None | +| [FiQA2018](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['eng'] | Retrieval | s2p | [Financial, Written] | None | None | | [FiQA2018-Fa](https://huggingface.co/datasets/MCINext/fiqa-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [FilipinoHateSpeechClassification](https://pcj.csp.org.ph/index.php/pcj/issue/download/29/PCJ%20V14%20N1%20pp1-14%202019) (Neil Vicente Cabasag et al., 2019) | ['fil'] | Classification | s2s | [Social, Written] | None | None | | [FilipinoShopeeReviewsClassification](https://uijrt.com/articles/v4/i8/UIJRTV4I80009.pdf) | ['fil'] | Classification | s2s | [Social, Written] | None | None | | [FinParaSTS](https://huggingface.co/datasets/TurkuNLP/turku_paraphrase_corpus) | ['fin'] | STS | s2s | [News, Subtitles, Written] | None | None | | [FinToxicityClassification](https://aclanthology.org/2023.nodalida-1.68) | 
['fin'] | Classification | s2s | [News, Written] | None | None | -| [FinancialPhrasebankClassification](https://arxiv.org/abs/1307.5336) (P. Malo, 2014) | ['eng'] | Classification | s2s | [News, Written, Financial] | None | None | -| [FloresBitextMining](https://huggingface.co/datasets/facebook/flores) (Goyal et al., 2022) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | BitextMining | s2s | [Non-fiction, Encyclopaedic, Written] | None | None | +| [FinancialPhrasebankClassification](https://arxiv.org/abs/1307.5336) (P. 
Malo, 2014) | ['eng'] | Classification | s2s | [Financial, News, Written] | None | None | +| [FloresBitextMining](https://huggingface.co/datasets/facebook/flores) (Goyal et al., 2022) | ['ace', 'acm', 'acq', 'aeb', 'afr', 'ajp', 'aka', 'als', 'amh', 'apc', 'arb', 'ars', 'ary', 'arz', 'asm', 'ast', 'awa', 'ayr', 'azb', 'azj', 'bak', 'bam', 'ban', 'bel', 'bem', 'ben', 'bho', 'bjn', 'bod', 'bos', 'bug', 'bul', 'cat', 'ceb', 'ces', 'cjk', 'ckb', 'crh', 'cym', 'dan', 'deu', 'dik', 'dyu', 'dzo', 'ell', 'eng', 'epo', 'est', 'eus', 'ewe', 'fao', 'fij', 'fin', 'fon', 'fra', 'fur', 'fuv', 'gaz', 'gla', 'gle', 'glg', 'grn', 'guj', 'hat', 'hau', 'heb', 'hin', 'hne', 'hrv', 'hun', 'hye', 'ibo', 'ilo', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kab', 'kac', 'kam', 'kan', 'kas', 'kat', 'kaz', 'kbp', 'kea', 'khk', 'khm', 'kik', 'kin', 'kir', 'kmb', 'kmr', 'knc', 'kon', 'kor', 'lao', 'lij', 'lim', 'lin', 'lit', 'lmo', 'ltg', 'ltz', 'lua', 'lug', 'luo', 'lus', 'lvs', 'mag', 'mai', 'mal', 'mar', 'min', 'mkd', 'mlt', 'mni', 'mos', 'mri', 'mya', 'nld', 'nno', 'nob', 'npi', 'nso', 'nus', 'nya', 'oci', 'ory', 'pag', 'pan', 'pap', 'pbt', 'pes', 'plt', 'pol', 'por', 'prs', 'quy', 'ron', 'run', 'rus', 'sag', 'san', 'sat', 'scn', 'shn', 'sin', 'slk', 'slv', 'smo', 'sna', 'snd', 'som', 'sot', 'spa', 'srd', 'srp', 'ssw', 'sun', 'swe', 'swh', 'szl', 'tam', 'taq', 'tat', 'tel', 'tgk', 'tgl', 'tha', 'tir', 'tpi', 'tsn', 'tso', 'tuk', 'tum', 'tur', 'twi', 'tzm', 'uig', 'ukr', 'umb', 'urd', 'uzn', 'vec', 'vie', 'war', 'wol', 'xho', 'ydd', 'yor', 'yue', 'zho', 'zsm', 'zul'] | BitextMining | s2s | [Encyclopaedic, Non-fiction, Written] | None | None | | [FrenchBookReviews](https://huggingface.co/datasets/Abirate/french_book_reviews) | ['fra'] | Classification | s2s | [Reviews, Written] | None | None | | [FrenkEnClassification](https://arxiv.org/abs/1906.02045) (Nikola Ljubešić, 2019) | ['eng'] | Classification | s2s | [Social, Written] | None | None | | 
[FrenkHrClassification](https://arxiv.org/abs/1906.02045) (Nikola Ljubešić, 2019) | ['hrv'] | Classification | s2s | [Social, Written] | None | None | @@ -238,7 +238,7 @@ The following tables give you an overview of the tasks in MTEB. | [GerDaLIRSmall](https://github.com/lavis-nlp/GerDaLIR) | ['deu'] | Retrieval | p2p | [Legal, Written] | None | None | | [GermanDPR](https://huggingface.co/datasets/deepset/germandpr) (Timo Möller, 2021) | ['deu'] | Retrieval | s2p | | None | None | | [GermanGovServiceRetrieval](https://huggingface.co/datasets/it-at-m/LHM-Dienstleistungen-QA) | ['deu'] | Retrieval | s2p | [Government, Written] | None | None | -| [GermanPoliticiansTwitterSentimentClassification](https://aclanthology.org/2022.konvens-1.9) | ['deu'] | Classification | s2s | [Social, Government, Written] | None | None | +| [GermanPoliticiansTwitterSentimentClassification](https://aclanthology.org/2022.konvens-1.9) | ['deu'] | Classification | s2s | [Government, Social, Written] | None | None | | [GermanQuAD-Retrieval](https://www.kaggle.com/datasets/GermanQuAD) (Timo Möller, 2021) | ['deu'] | Retrieval | s2p | | None | None | | [GermanSTSBenchmark](https://github.com/t-systems-on-site-services-gmbh/german-STSbenchmark) (Philip May, 2021) | ['deu'] | STS | s2s | | None | None | | [GreekCivicsQA](https://huggingface.co/datasets/antoinelb7/alloprof) | ['ell'] | Retrieval | s2p | [Academic, Written] | None | None | @@ -261,14 +261,14 @@ The following tables give you an overview of the tasks in MTEB. 
| [HotpotQAHardNegatives](https://hotpotqa.github.io/) | ['eng'] | Retrieval | s2p | [Web, Written] | None | None | | [HunSum2AbstractiveRetrieval](https://arxiv.org/abs/2404.03555) (Botond Barta, 2024) | ['hun'] | Retrieval | s2p | [News, Written] | None | None | | [IFlyTek](https://www.cluebenchmarks.com/introduce.html) | ['cmn'] | Classification | s2s | | None | None | -| [IN22ConvBitextMining](https://huggingface.co/datasets/ai4bharat/IN22-Conv) (Jay Gala, 2023) | ['asm', 'ben', 'brx', 'doi', 'eng', 'gom', 'guj', 'hin', 'kan', 'kas', 'mai', 'mal', 'mar', 'mni', 'npi', 'ory', 'pan', 'san', 'sat', 'snd', 'tam', 'tel', 'urd'] | BitextMining | s2s | [Social, Spoken, Fiction, Spoken] | {'test': 760518} | {'test': {'num_samples': 760518, 'number_of_characters': 82637104, 'unique_pairs': 759283, 'min_sentence1_length': 3, 'average_sentence1_length': 54.33, 'max_sentence1_length': 239, 'unique_sentence1': 34430, 'min_sentence2_length': 3, 'average_sentence2_length': 54.33, 'max_sentence2_length': 239, 'unique_sentence2': 34430, 'hf_subset_descriptive_stats': {'asm_Beng-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155988, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'asm_Beng-brx_Deva': {'num_samples': 1503, 'number_of_characters': 162044, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'asm_Beng-doi_Deva': {'num_samples': 1503, 'number_of_characters': 167032, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 
'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'asm_Beng-eng_Latn': {'num_samples': 1503, 'number_of_characters': 160716, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'asm_Beng-gom_Deva': {'num_samples': 1503, 'number_of_characters': 156282, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'asm_Beng-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 158269, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'asm_Beng-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159964, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'asm_Beng-kan_Knda': {'num_samples': 1503, 'number_of_characters': 165177, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'asm_Beng-kas_Arab': {'num_samples': 1503, 'number_of_characters': 164681, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 
'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'asm_Beng-mai_Deva': {'num_samples': 1503, 'number_of_characters': 162408, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'asm_Beng-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 172838, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'asm_Beng-mar_Deva': {'num_samples': 1503, 'number_of_characters': 162747, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'asm_Beng-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 157316, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'asm_Beng-npi_Deva': {'num_samples': 1503, 'number_of_characters': 160906, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'asm_Beng-ory_Orya': {'num_samples': 1503, 'number_of_characters': 164223, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 
'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'asm_Beng-pan_Guru': {'num_samples': 1503, 'number_of_characters': 160201, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'asm_Beng-san_Deva': {'num_samples': 1503, 'number_of_characters': 158093, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'asm_Beng-sat_Olck': {'num_samples': 1503, 'number_of_characters': 169379, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'asm_Beng-snd_Deva': {'num_samples': 1503, 'number_of_characters': 162623, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'asm_Beng-tam_Taml': {'num_samples': 1503, 'number_of_characters': 174866, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'asm_Beng-tel_Telu': {'num_samples': 1503, 'number_of_characters': 157690, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 
'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'asm_Beng-urd_Arab': {'num_samples': 1503, 'number_of_characters': 161305, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'ben_Beng-asm_Beng': {'num_samples': 1503, 'number_of_characters': 155988, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'ben_Beng-brx_Deva': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'ben_Beng-doi_Deva': {'num_samples': 1503, 'number_of_characters': 161436, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'ben_Beng-eng_Latn': {'num_samples': 1503, 'number_of_characters': 155120, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'ben_Beng-gom_Deva': {'num_samples': 1503, 'number_of_characters': 150686, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 
'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'ben_Beng-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 152673, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'ben_Beng-hin_Deva': {'num_samples': 1503, 'number_of_characters': 154368, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'ben_Beng-kan_Knda': {'num_samples': 1503, 'number_of_characters': 159581, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'ben_Beng-kas_Arab': {'num_samples': 1503, 'number_of_characters': 159085, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'ben_Beng-mai_Deva': {'num_samples': 1503, 'number_of_characters': 156812, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'ben_Beng-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 167242, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 
'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'ben_Beng-mar_Deva': {'num_samples': 1503, 'number_of_characters': 157151, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'ben_Beng-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 151720, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'ben_Beng-npi_Deva': {'num_samples': 1503, 'number_of_characters': 155310, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'ben_Beng-ory_Orya': {'num_samples': 1503, 'number_of_characters': 158627, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'ben_Beng-pan_Guru': {'num_samples': 1503, 'number_of_characters': 154605, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'ben_Beng-san_Deva': {'num_samples': 1503, 'number_of_characters': 152497, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 
'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'ben_Beng-sat_Olck': {'num_samples': 1503, 'number_of_characters': 163783, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'ben_Beng-snd_Deva': {'num_samples': 1503, 'number_of_characters': 157027, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'ben_Beng-tam_Taml': {'num_samples': 1503, 'number_of_characters': 169270, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'ben_Beng-tel_Telu': {'num_samples': 1503, 'number_of_characters': 152094, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'ben_Beng-urd_Arab': {'num_samples': 1503, 'number_of_characters': 155709, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'brx_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162044, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'brx_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'brx_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 167492, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'brx_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161176, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'brx_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 156742, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'brx_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'brx_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 160424, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 
'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'brx_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 165637, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'brx_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165141, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'brx_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 162868, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'brx_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 173298, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'brx_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 163207, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'brx_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 157776, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 
'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'brx_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'brx_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 164683, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'brx_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 160661, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'brx_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 158553, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'brx_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 169839, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'brx_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 163083, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 
'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'brx_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 175326, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'brx_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158150, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'brx_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 161765, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'doi_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 167032, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'doi_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 161436, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'doi_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 167492, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 
'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'doi_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 166164, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'doi_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'doi_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 163717, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'doi_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 165412, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'doi_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 170625, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'doi_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 170129, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 
'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'doi_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 167856, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'doi_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 178286, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'doi_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 168195, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'doi_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 162764, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'doi_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 166354, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'doi_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 169671, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 
'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'doi_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 165649, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'doi_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 163541, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'doi_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 174827, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'doi_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 168071, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'doi_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 180314, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'doi_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 163138, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 
'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'doi_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 166753, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'eng_Latn-asm_Beng': {'num_samples': 1503, 'number_of_characters': 160716, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'eng_Latn-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155120, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'eng_Latn-brx_Deva': {'num_samples': 1503, 'number_of_characters': 161176, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'eng_Latn-doi_Deva': {'num_samples': 1503, 'number_of_characters': 166164, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'eng_Latn-gom_Deva': {'num_samples': 1503, 'number_of_characters': 155414, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 
'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'eng_Latn-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 157401, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'eng_Latn-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159096, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'eng_Latn-kan_Knda': {'num_samples': 1503, 'number_of_characters': 164309, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'eng_Latn-kas_Arab': {'num_samples': 1503, 'number_of_characters': 163813, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'eng_Latn-mai_Deva': {'num_samples': 1503, 'number_of_characters': 161540, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'eng_Latn-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 171970, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 
'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'eng_Latn-mar_Deva': {'num_samples': 1503, 'number_of_characters': 161879, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'eng_Latn-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'eng_Latn-npi_Deva': {'num_samples': 1503, 'number_of_characters': 160038, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'eng_Latn-ory_Orya': {'num_samples': 1503, 'number_of_characters': 163355, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'eng_Latn-pan_Guru': {'num_samples': 1503, 'number_of_characters': 159333, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'eng_Latn-san_Deva': {'num_samples': 1503, 'number_of_characters': 157225, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 
'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'eng_Latn-sat_Olck': {'num_samples': 1503, 'number_of_characters': 168511, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'eng_Latn-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161755, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'eng_Latn-tam_Taml': {'num_samples': 1503, 'number_of_characters': 173998, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'eng_Latn-tel_Telu': {'num_samples': 1503, 'number_of_characters': 156822, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'eng_Latn-urd_Arab': {'num_samples': 1503, 'number_of_characters': 160437, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'gom_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 156282, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'gom_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 150686, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'gom_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 156742, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'gom_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'gom_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 155414, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'gom_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 152967, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'gom_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 154662, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 
'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'gom_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 159875, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'gom_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 159379, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'gom_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 157106, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'gom_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 167536, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'gom_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 157445, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'gom_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 152014, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 
'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'gom_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 155604, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'gom_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 158921, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'gom_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 154899, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'gom_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 152791, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'gom_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 164077, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'gom_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 157321, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 
'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'gom_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 169564, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'gom_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 152388, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'gom_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 156003, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'guj_Gujr-asm_Beng': {'num_samples': 1503, 'number_of_characters': 158269, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'guj_Gujr-ben_Beng': {'num_samples': 1503, 'number_of_characters': 152673, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'guj_Gujr-brx_Deva': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 
'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'guj_Gujr-doi_Deva': {'num_samples': 1503, 'number_of_characters': 163717, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'guj_Gujr-eng_Latn': {'num_samples': 1503, 'number_of_characters': 157401, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'guj_Gujr-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152967, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'guj_Gujr-hin_Deva': {'num_samples': 1503, 'number_of_characters': 156649, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'guj_Gujr-kan_Knda': {'num_samples': 1503, 'number_of_characters': 161862, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'guj_Gujr-kas_Arab': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 
'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'guj_Gujr-mai_Deva': {'num_samples': 1503, 'number_of_characters': 159093, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'guj_Gujr-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 169523, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'guj_Gujr-mar_Deva': {'num_samples': 1503, 'number_of_characters': 159432, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'guj_Gujr-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 154001, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'guj_Gujr-npi_Deva': {'num_samples': 1503, 'number_of_characters': 157591, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'guj_Gujr-ory_Orya': {'num_samples': 1503, 'number_of_characters': 160908, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 
'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'guj_Gujr-pan_Guru': {'num_samples': 1503, 'number_of_characters': 156886, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'guj_Gujr-san_Deva': {'num_samples': 1503, 'number_of_characters': 154778, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'guj_Gujr-sat_Olck': {'num_samples': 1503, 'number_of_characters': 166064, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'guj_Gujr-snd_Deva': {'num_samples': 1503, 'number_of_characters': 159308, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'guj_Gujr-tam_Taml': {'num_samples': 1503, 'number_of_characters': 171551, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'guj_Gujr-tel_Telu': {'num_samples': 1503, 'number_of_characters': 154375, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 
'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'guj_Gujr-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157990, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'hin_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 159964, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'hin_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 154368, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'hin_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 160424, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'hin_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 165412, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'hin_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 159096, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'hin_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 154662, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'hin_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 156649, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'hin_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 163557, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'hin_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 163061, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'hin_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 160788, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'hin_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 171218, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 
'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'hin_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 161127, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'hin_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 155696, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'hin_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 159286, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'hin_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 162603, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'hin_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 158581, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'hin_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 156473, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 
'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'hin_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 167759, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'hin_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'hin_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 173246, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'hin_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 156070, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'hin_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 159685, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'kan_Knda-asm_Beng': {'num_samples': 1503, 'number_of_characters': 165177, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'kan_Knda-ben_Beng': {'num_samples': 1503, 'number_of_characters': 159581, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'kan_Knda-brx_Deva': {'num_samples': 1503, 'number_of_characters': 165637, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'kan_Knda-doi_Deva': {'num_samples': 1503, 'number_of_characters': 170625, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'kan_Knda-eng_Latn': {'num_samples': 1503, 'number_of_characters': 164309, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'kan_Knda-gom_Deva': {'num_samples': 1503, 'number_of_characters': 159875, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'kan_Knda-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 161862, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 
'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'kan_Knda-hin_Deva': {'num_samples': 1503, 'number_of_characters': 163557, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'kan_Knda-kas_Arab': {'num_samples': 1503, 'number_of_characters': 168274, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'kan_Knda-mai_Deva': {'num_samples': 1503, 'number_of_characters': 166001, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'kan_Knda-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 176431, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'kan_Knda-mar_Deva': {'num_samples': 1503, 'number_of_characters': 166340, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'kan_Knda-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 160909, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 
'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'kan_Knda-npi_Deva': {'num_samples': 1503, 'number_of_characters': 164499, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'kan_Knda-ory_Orya': {'num_samples': 1503, 'number_of_characters': 167816, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'kan_Knda-pan_Guru': {'num_samples': 1503, 'number_of_characters': 163794, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'kan_Knda-san_Deva': {'num_samples': 1503, 'number_of_characters': 161686, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'kan_Knda-sat_Olck': {'num_samples': 1503, 'number_of_characters': 172972, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'kan_Knda-snd_Deva': {'num_samples': 1503, 'number_of_characters': 166216, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 
'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'kan_Knda-tam_Taml': {'num_samples': 1503, 'number_of_characters': 178459, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'kan_Knda-tel_Telu': {'num_samples': 1503, 'number_of_characters': 161283, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'kan_Knda-urd_Arab': {'num_samples': 1503, 'number_of_characters': 164898, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'kas_Arab-asm_Beng': {'num_samples': 1503, 'number_of_characters': 164681, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'kas_Arab-ben_Beng': {'num_samples': 1503, 'number_of_characters': 159085, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'kas_Arab-brx_Deva': {'num_samples': 1503, 'number_of_characters': 165141, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 
'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'kas_Arab-doi_Deva': {'num_samples': 1503, 'number_of_characters': 170129, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'kas_Arab-eng_Latn': {'num_samples': 1503, 'number_of_characters': 163813, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'kas_Arab-gom_Deva': {'num_samples': 1503, 'number_of_characters': 159379, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'kas_Arab-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'kas_Arab-hin_Deva': {'num_samples': 1503, 'number_of_characters': 163061, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'kas_Arab-kan_Knda': {'num_samples': 1503, 'number_of_characters': 168274, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 
'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'kas_Arab-mai_Deva': {'num_samples': 1503, 'number_of_characters': 165505, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'kas_Arab-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 175935, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'kas_Arab-mar_Deva': {'num_samples': 1503, 'number_of_characters': 165844, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'kas_Arab-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 160413, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'kas_Arab-npi_Deva': {'num_samples': 1503, 'number_of_characters': 164003, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'kas_Arab-ory_Orya': {'num_samples': 1503, 'number_of_characters': 167320, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 
'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'kas_Arab-pan_Guru': {'num_samples': 1503, 'number_of_characters': 163298, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'kas_Arab-san_Deva': {'num_samples': 1503, 'number_of_characters': 161190, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'kas_Arab-sat_Olck': {'num_samples': 1503, 'number_of_characters': 172476, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'kas_Arab-snd_Deva': {'num_samples': 1503, 'number_of_characters': 165720, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'kas_Arab-tam_Taml': {'num_samples': 1503, 'number_of_characters': 177963, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'kas_Arab-tel_Telu': {'num_samples': 1503, 'number_of_characters': 160787, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 6, 
'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'kas_Arab-urd_Arab': {'num_samples': 1503, 'number_of_characters': 164402, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mai_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162408, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mai_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 156812, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mai_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 162868, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mai_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 167856, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mai_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161540, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mai_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 157106, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mai_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 159093, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mai_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 160788, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mai_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 166001, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mai_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165505, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mai_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 173662, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 
'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'mai_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 163571, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'mai_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 158140, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'mai_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mai_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 165047, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mai_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 161025, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mai_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 158917, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 
'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mai_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 170203, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mai_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 163447, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mai_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 175690, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mai_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158514, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mai_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 162129, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mal_Mlym-asm_Beng': {'num_samples': 1503, 'number_of_characters': 172838, 'unique_pairs': 1498, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mal_Mlym-ben_Beng': {'num_samples': 1503, 'number_of_characters': 167242, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mal_Mlym-brx_Deva': {'num_samples': 1503, 'number_of_characters': 173298, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mal_Mlym-doi_Deva': {'num_samples': 1503, 'number_of_characters': 178286, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mal_Mlym-eng_Latn': {'num_samples': 1503, 'number_of_characters': 171970, 'unique_pairs': 1499, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mal_Mlym-gom_Deva': {'num_samples': 1503, 'number_of_characters': 167536, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mal_Mlym-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 169523, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 
'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mal_Mlym-hin_Deva': {'num_samples': 1503, 'number_of_characters': 171218, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mal_Mlym-kan_Knda': {'num_samples': 1503, 'number_of_characters': 176431, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mal_Mlym-kas_Arab': {'num_samples': 1503, 'number_of_characters': 175935, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mal_Mlym-mai_Deva': {'num_samples': 1503, 'number_of_characters': 173662, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'mal_Mlym-mar_Deva': {'num_samples': 1503, 'number_of_characters': 174001, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'mal_Mlym-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 168570, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 
'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'mal_Mlym-npi_Deva': {'num_samples': 1503, 'number_of_characters': 172160, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mal_Mlym-ory_Orya': {'num_samples': 1503, 'number_of_characters': 175477, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mal_Mlym-pan_Guru': {'num_samples': 1503, 'number_of_characters': 171455, 'unique_pairs': 1498, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mal_Mlym-san_Deva': {'num_samples': 1503, 'number_of_characters': 169347, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mal_Mlym-sat_Olck': {'num_samples': 1503, 'number_of_characters': 180633, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mal_Mlym-snd_Deva': {'num_samples': 1503, 'number_of_characters': 173877, 'unique_pairs': 1499, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 
'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mal_Mlym-tam_Taml': {'num_samples': 1503, 'number_of_characters': 186120, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mal_Mlym-tel_Telu': {'num_samples': 1503, 'number_of_characters': 168944, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mal_Mlym-urd_Arab': {'num_samples': 1503, 'number_of_characters': 172559, 'unique_pairs': 1499, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mar_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162747, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mar_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 157151, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mar_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 163207, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 
'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mar_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 168195, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mar_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161879, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mar_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 157445, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mar_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 159432, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mar_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 161127, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mar_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 166340, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 
'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mar_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165844, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mar_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 163571, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'mar_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 174001, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'mar_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 158479, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'mar_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 162069, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mar_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 165386, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 
'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mar_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 161364, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mar_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 159256, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mar_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 170542, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mar_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 163786, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mar_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 176029, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mar_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158853, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 6, 
'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mar_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 162468, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mni_Mtei-asm_Beng': {'num_samples': 1503, 'number_of_characters': 157316, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mni_Mtei-ben_Beng': {'num_samples': 1503, 'number_of_characters': 151720, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mni_Mtei-brx_Deva': {'num_samples': 1503, 'number_of_characters': 157776, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mni_Mtei-doi_Deva': {'num_samples': 1503, 'number_of_characters': 162764, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mni_Mtei-eng_Latn': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mni_Mtei-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152014, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mni_Mtei-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 154001, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mni_Mtei-hin_Deva': {'num_samples': 1503, 'number_of_characters': 155696, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mni_Mtei-kan_Knda': {'num_samples': 1503, 'number_of_characters': 160909, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mni_Mtei-kas_Arab': {'num_samples': 1503, 'number_of_characters': 160413, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mni_Mtei-mai_Deva': {'num_samples': 1503, 'number_of_characters': 158140, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 
'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'mni_Mtei-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 168570, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'mni_Mtei-mar_Deva': {'num_samples': 1503, 'number_of_characters': 158479, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'mni_Mtei-npi_Deva': {'num_samples': 1503, 'number_of_characters': 156638, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mni_Mtei-ory_Orya': {'num_samples': 1503, 'number_of_characters': 159955, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mni_Mtei-pan_Guru': {'num_samples': 1503, 'number_of_characters': 155933, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mni_Mtei-san_Deva': {'num_samples': 1503, 'number_of_characters': 153825, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 
'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mni_Mtei-sat_Olck': {'num_samples': 1503, 'number_of_characters': 165111, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mni_Mtei-snd_Deva': {'num_samples': 1503, 'number_of_characters': 158355, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mni_Mtei-tam_Taml': {'num_samples': 1503, 'number_of_characters': 170598, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mni_Mtei-tel_Telu': {'num_samples': 1503, 'number_of_characters': 153422, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mni_Mtei-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157037, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'npi_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 160906, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'npi_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155310, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'npi_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'npi_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 166354, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'npi_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 160038, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'npi_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 155604, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'npi_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 157591, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 
'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'npi_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159286, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'npi_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 164499, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'npi_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 164003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'npi_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'npi_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 172160, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'npi_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 162069, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 
'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'npi_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 156638, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'npi_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 163545, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'npi_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 159523, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'npi_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 157415, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'npi_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 168701, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'npi_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161945, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 
'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'npi_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 174188, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'npi_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 157012, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'npi_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 160627, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'ory_Orya-asm_Beng': {'num_samples': 1503, 'number_of_characters': 164223, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'ory_Orya-ben_Beng': {'num_samples': 1503, 'number_of_characters': 158627, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'ory_Orya-brx_Deva': {'num_samples': 1503, 'number_of_characters': 164683, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 
'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'ory_Orya-doi_Deva': {'num_samples': 1503, 'number_of_characters': 169671, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'ory_Orya-eng_Latn': {'num_samples': 1503, 'number_of_characters': 163355, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'ory_Orya-gom_Deva': {'num_samples': 1503, 'number_of_characters': 158921, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'ory_Orya-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 160908, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'ory_Orya-hin_Deva': {'num_samples': 1503, 'number_of_characters': 162603, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'ory_Orya-kan_Knda': {'num_samples': 1503, 'number_of_characters': 167816, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 
'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'ory_Orya-kas_Arab': {'num_samples': 1503, 'number_of_characters': 167320, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'ory_Orya-mai_Deva': {'num_samples': 1503, 'number_of_characters': 165047, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'ory_Orya-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 175477, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'ory_Orya-mar_Deva': {'num_samples': 1503, 'number_of_characters': 165386, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'ory_Orya-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 159955, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'ory_Orya-npi_Deva': {'num_samples': 1503, 'number_of_characters': 163545, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'ory_Orya-pan_Guru': {'num_samples': 1503, 'number_of_characters': 162840, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'ory_Orya-san_Deva': {'num_samples': 1503, 'number_of_characters': 160732, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'ory_Orya-sat_Olck': {'num_samples': 1503, 'number_of_characters': 172018, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'ory_Orya-snd_Deva': {'num_samples': 1503, 'number_of_characters': 165262, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'ory_Orya-tam_Taml': {'num_samples': 1503, 'number_of_characters': 177505, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'ory_Orya-tel_Telu': {'num_samples': 1503, 'number_of_characters': 160329, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 
'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'ory_Orya-urd_Arab': {'num_samples': 1503, 'number_of_characters': 163944, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'pan_Guru-asm_Beng': {'num_samples': 1503, 'number_of_characters': 160201, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'pan_Guru-ben_Beng': {'num_samples': 1503, 'number_of_characters': 154605, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'pan_Guru-brx_Deva': {'num_samples': 1503, 'number_of_characters': 160661, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'pan_Guru-doi_Deva': {'num_samples': 1503, 'number_of_characters': 165649, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'pan_Guru-eng_Latn': {'num_samples': 1503, 'number_of_characters': 159333, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'pan_Guru-gom_Deva': {'num_samples': 1503, 'number_of_characters': 154899, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'pan_Guru-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 156886, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'pan_Guru-hin_Deva': {'num_samples': 1503, 'number_of_characters': 158581, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'pan_Guru-kan_Knda': {'num_samples': 1503, 'number_of_characters': 163794, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'pan_Guru-kas_Arab': {'num_samples': 1503, 'number_of_characters': 163298, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'pan_Guru-mai_Deva': {'num_samples': 1503, 'number_of_characters': 161025, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 
'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'pan_Guru-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 171455, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'pan_Guru-mar_Deva': {'num_samples': 1503, 'number_of_characters': 161364, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'pan_Guru-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 155933, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'pan_Guru-npi_Deva': {'num_samples': 1503, 'number_of_characters': 159523, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'pan_Guru-ory_Orya': {'num_samples': 1503, 'number_of_characters': 162840, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'pan_Guru-san_Deva': {'num_samples': 1503, 'number_of_characters': 156710, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 
'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'pan_Guru-sat_Olck': {'num_samples': 1503, 'number_of_characters': 167996, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'pan_Guru-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161240, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'pan_Guru-tam_Taml': {'num_samples': 1503, 'number_of_characters': 173483, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'pan_Guru-tel_Telu': {'num_samples': 1503, 'number_of_characters': 156307, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'pan_Guru-urd_Arab': {'num_samples': 1503, 'number_of_characters': 159922, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'san_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 158093, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'san_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 152497, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'san_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 158553, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'san_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 163541, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'san_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 157225, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'san_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152791, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'san_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 154778, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 
'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'san_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 156473, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'san_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 161686, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'san_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 161190, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'san_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 158917, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'san_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 169347, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'san_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 159256, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 
'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'san_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 153825, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'san_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 157415, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'san_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 160732, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'san_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 156710, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'san_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 165888, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'san_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 159132, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 
'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'san_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 171375, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'san_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 154199, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'san_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157814, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'sat_Olck-asm_Beng': {'num_samples': 1503, 'number_of_characters': 169379, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'sat_Olck-ben_Beng': {'num_samples': 1503, 'number_of_characters': 163783, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'sat_Olck-brx_Deva': {'num_samples': 1503, 'number_of_characters': 169839, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 
'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'sat_Olck-doi_Deva': {'num_samples': 1503, 'number_of_characters': 174827, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'sat_Olck-eng_Latn': {'num_samples': 1503, 'number_of_characters': 168511, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'sat_Olck-gom_Deva': {'num_samples': 1503, 'number_of_characters': 164077, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'sat_Olck-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 166064, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'sat_Olck-hin_Deva': {'num_samples': 1503, 'number_of_characters': 167759, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'sat_Olck-kan_Knda': {'num_samples': 1503, 'number_of_characters': 172972, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 
'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'sat_Olck-kas_Arab': {'num_samples': 1503, 'number_of_characters': 172476, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'sat_Olck-mai_Deva': {'num_samples': 1503, 'number_of_characters': 170203, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'sat_Olck-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 180633, 'unique_pairs': 1501, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'sat_Olck-mar_Deva': {'num_samples': 1503, 'number_of_characters': 170542, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'sat_Olck-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 165111, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'sat_Olck-npi_Deva': {'num_samples': 1503, 'number_of_characters': 168701, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'sat_Olck-ory_Orya': {'num_samples': 1503, 'number_of_characters': 172018, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'sat_Olck-pan_Guru': {'num_samples': 1503, 'number_of_characters': 167996, 'unique_pairs': 1501, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'sat_Olck-san_Deva': {'num_samples': 1503, 'number_of_characters': 165888, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'sat_Olck-snd_Deva': {'num_samples': 1503, 'number_of_characters': 170418, 'unique_pairs': 1501, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'sat_Olck-tam_Taml': {'num_samples': 1503, 'number_of_characters': 182661, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'sat_Olck-tel_Telu': {'num_samples': 1503, 'number_of_characters': 165485, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 
'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'sat_Olck-urd_Arab': {'num_samples': 1503, 'number_of_characters': 169100, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'snd_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162623, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'snd_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 157027, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'snd_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 163083, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'snd_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 168071, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'snd_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161755, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'snd_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 157321, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'snd_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 159308, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'snd_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 161003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'snd_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 166216, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'snd_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165720, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'snd_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 163447, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 
'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'snd_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 173877, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'snd_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 163786, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'snd_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 158355, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'snd_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 161945, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'snd_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 165262, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'snd_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 161240, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 
'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'snd_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 159132, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'snd_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 170418, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'snd_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 175905, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'snd_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'snd_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 162344, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'tam_Taml-asm_Beng': {'num_samples': 1503, 'number_of_characters': 174866, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'tam_Taml-ben_Beng': {'num_samples': 1503, 'number_of_characters': 169270, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'tam_Taml-brx_Deva': {'num_samples': 1503, 'number_of_characters': 175326, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'tam_Taml-doi_Deva': {'num_samples': 1503, 'number_of_characters': 180314, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'tam_Taml-eng_Latn': {'num_samples': 1503, 'number_of_characters': 173998, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'tam_Taml-gom_Deva': {'num_samples': 1503, 'number_of_characters': 169564, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'tam_Taml-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 171551, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 
'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'tam_Taml-hin_Deva': {'num_samples': 1503, 'number_of_characters': 173246, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'tam_Taml-kan_Knda': {'num_samples': 1503, 'number_of_characters': 178459, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'tam_Taml-kas_Arab': {'num_samples': 1503, 'number_of_characters': 177963, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'tam_Taml-mai_Deva': {'num_samples': 1503, 'number_of_characters': 175690, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'tam_Taml-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 186120, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'tam_Taml-mar_Deva': {'num_samples': 1503, 'number_of_characters': 176029, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 
'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'tam_Taml-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 170598, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'tam_Taml-npi_Deva': {'num_samples': 1503, 'number_of_characters': 174188, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'tam_Taml-ory_Orya': {'num_samples': 1503, 'number_of_characters': 177505, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'tam_Taml-pan_Guru': {'num_samples': 1503, 'number_of_characters': 173483, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'tam_Taml-san_Deva': {'num_samples': 1503, 'number_of_characters': 171375, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'tam_Taml-sat_Olck': {'num_samples': 1503, 'number_of_characters': 182661, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 7, 
'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'tam_Taml-snd_Deva': {'num_samples': 1503, 'number_of_characters': 175905, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'tam_Taml-tel_Telu': {'num_samples': 1503, 'number_of_characters': 170972, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'tam_Taml-urd_Arab': {'num_samples': 1503, 'number_of_characters': 174587, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'tel_Telu-asm_Beng': {'num_samples': 1503, 'number_of_characters': 157690, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'tel_Telu-ben_Beng': {'num_samples': 1503, 'number_of_characters': 152094, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'tel_Telu-brx_Deva': {'num_samples': 1503, 'number_of_characters': 158150, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 
'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'tel_Telu-doi_Deva': {'num_samples': 1503, 'number_of_characters': 163138, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'tel_Telu-eng_Latn': {'num_samples': 1503, 'number_of_characters': 156822, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'tel_Telu-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152388, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'tel_Telu-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 154375, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'tel_Telu-hin_Deva': {'num_samples': 1503, 'number_of_characters': 156070, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'tel_Telu-kan_Knda': {'num_samples': 1503, 'number_of_characters': 161283, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 
'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'tel_Telu-kas_Arab': {'num_samples': 1503, 'number_of_characters': 160787, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'tel_Telu-mai_Deva': {'num_samples': 1503, 'number_of_characters': 158514, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'tel_Telu-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 168944, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'tel_Telu-mar_Deva': {'num_samples': 1503, 'number_of_characters': 158853, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'tel_Telu-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 153422, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'tel_Telu-npi_Deva': {'num_samples': 1503, 'number_of_characters': 157012, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'tel_Telu-ory_Orya': {'num_samples': 1503, 'number_of_characters': 160329, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'tel_Telu-pan_Guru': {'num_samples': 1503, 'number_of_characters': 156307, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'tel_Telu-san_Deva': {'num_samples': 1503, 'number_of_characters': 154199, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'tel_Telu-sat_Olck': {'num_samples': 1503, 'number_of_characters': 165485, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'tel_Telu-snd_Deva': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'tel_Telu-tam_Taml': {'num_samples': 1503, 'number_of_characters': 170972, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 
'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'tel_Telu-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157411, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'urd_Arab-asm_Beng': {'num_samples': 1503, 'number_of_characters': 161305, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'urd_Arab-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155709, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'urd_Arab-brx_Deva': {'num_samples': 1503, 'number_of_characters': 161765, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'urd_Arab-doi_Deva': {'num_samples': 1503, 'number_of_characters': 166753, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'urd_Arab-eng_Latn': {'num_samples': 1503, 'number_of_characters': 160437, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 
'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'urd_Arab-gom_Deva': {'num_samples': 1503, 'number_of_characters': 156003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'urd_Arab-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 157990, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'urd_Arab-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159685, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'urd_Arab-kan_Knda': {'num_samples': 1503, 'number_of_characters': 164898, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'urd_Arab-kas_Arab': {'num_samples': 1503, 'number_of_characters': 164402, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'urd_Arab-mai_Deva': {'num_samples': 1503, 'number_of_characters': 162129, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 
'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'urd_Arab-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 172559, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'urd_Arab-mar_Deva': {'num_samples': 1503, 'number_of_characters': 162468, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'urd_Arab-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 157037, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'urd_Arab-npi_Deva': {'num_samples': 1503, 'number_of_characters': 160627, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'urd_Arab-ory_Orya': {'num_samples': 1503, 'number_of_characters': 163944, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'urd_Arab-pan_Guru': {'num_samples': 1503, 'number_of_characters': 159922, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 
'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'urd_Arab-san_Deva': {'num_samples': 1503, 'number_of_characters': 157814, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'urd_Arab-sat_Olck': {'num_samples': 1503, 'number_of_characters': 169100, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'urd_Arab-snd_Deva': {'num_samples': 1503, 'number_of_characters': 162344, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'urd_Arab-tam_Taml': {'num_samples': 1503, 'number_of_characters': 174587, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'urd_Arab-tel_Telu': {'num_samples': 1503, 'number_of_characters': 157411, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}}}} | -| [IN22GenBitextMining](https://huggingface.co/datasets/ai4bharat/IN22-Gen) (Jay Gala, 2023) | ['asm', 'ben', 'brx', 'doi', 'eng', 'gom', 'guj', 'hin', 'kan', 'kas', 'mai', 'mal', 'mar', 'mni', 'npi', 'ory', 'pan', 'san', 'sat', 'snd', 'tam', 
'tel', 'urd'] | BitextMining | s2s | [Web, Legal, Government, News, Religious, Non-fiction, Written] | {'test': 518144} | {'test': {'num_samples': 518144, 'number_of_characters': 162367876, 'unique_pairs': 518101, 'min_sentence1_length': 9, 'average_sentence1_length': 156.68, 'max_sentence1_length': 692, 'unique_sentence1': 23550, 'min_sentence2_length': 9, 'average_sentence2_length': 156.68, 'max_sentence2_length': 692, 'unique_sentence2': 23550, 'hf_subset_descriptive_stats': {'asm_Beng-ben_Beng': {'num_samples': 1024, 'number_of_characters': 310622, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'asm_Beng-brx_Deva': {'num_samples': 1024, 'number_of_characters': 323609, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'asm_Beng-doi_Deva': {'num_samples': 1024, 'number_of_characters': 319020, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'asm_Beng-eng_Latn': {'num_samples': 1024, 'number_of_characters': 320098, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'asm_Beng-gom_Deva': {'num_samples': 1024, 'number_of_characters': 312594, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 
'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'asm_Beng-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 309440, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'asm_Beng-hin_Deva': {'num_samples': 1024, 'number_of_characters': 320106, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'asm_Beng-kan_Knda': {'num_samples': 1024, 'number_of_characters': 332064, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'asm_Beng-kas_Arab': {'num_samples': 1024, 'number_of_characters': 322764, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'asm_Beng-mai_Deva': {'num_samples': 1024, 'number_of_characters': 308682, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'asm_Beng-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 343636, 'unique_pairs': 1024, 'min_sentence1_length': 13, 
'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'asm_Beng-mar_Deva': {'num_samples': 1024, 'number_of_characters': 321784, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'asm_Beng-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 313134, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'asm_Beng-npi_Deva': {'num_samples': 1024, 'number_of_characters': 313419, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'asm_Beng-ory_Orya': {'num_samples': 1024, 'number_of_characters': 334226, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'asm_Beng-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306863, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'asm_Beng-san_Deva': {'num_samples': 1024, 'number_of_characters': 318079, 'unique_pairs': 1024, 
'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'asm_Beng-sat_Olck': {'num_samples': 1024, 'number_of_characters': 326732, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'asm_Beng-snd_Deva': {'num_samples': 1024, 'number_of_characters': 320421, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'asm_Beng-tam_Taml': {'num_samples': 1024, 'number_of_characters': 348346, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'asm_Beng-tel_Telu': {'num_samples': 1024, 'number_of_characters': 319045, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'asm_Beng-urd_Arab': {'num_samples': 1024, 'number_of_characters': 315134, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'ben_Beng-asm_Beng': {'num_samples': 1024, 'number_of_characters': 310622, 
'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'ben_Beng-brx_Deva': {'num_samples': 1024, 'number_of_characters': 313313, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'ben_Beng-doi_Deva': {'num_samples': 1024, 'number_of_characters': 308724, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'ben_Beng-eng_Latn': {'num_samples': 1024, 'number_of_characters': 309802, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'ben_Beng-gom_Deva': {'num_samples': 1024, 'number_of_characters': 302298, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'ben_Beng-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 299144, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'ben_Beng-hin_Deva': {'num_samples': 1024, 'number_of_characters': 
309810, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'ben_Beng-kan_Knda': {'num_samples': 1024, 'number_of_characters': 321768, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'ben_Beng-kas_Arab': {'num_samples': 1024, 'number_of_characters': 312468, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'ben_Beng-mai_Deva': {'num_samples': 1024, 'number_of_characters': 298386, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'ben_Beng-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 333340, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'ben_Beng-mar_Deva': {'num_samples': 1024, 'number_of_characters': 311488, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'ben_Beng-mni_Mtei': {'num_samples': 1024, 
'number_of_characters': 302838, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'ben_Beng-npi_Deva': {'num_samples': 1024, 'number_of_characters': 303123, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'ben_Beng-ory_Orya': {'num_samples': 1024, 'number_of_characters': 323930, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'ben_Beng-pan_Guru': {'num_samples': 1024, 'number_of_characters': 296567, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'ben_Beng-san_Deva': {'num_samples': 1024, 'number_of_characters': 307783, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'ben_Beng-sat_Olck': {'num_samples': 1024, 'number_of_characters': 316436, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'ben_Beng-snd_Deva': 
{'num_samples': 1024, 'number_of_characters': 310125, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'ben_Beng-tam_Taml': {'num_samples': 1024, 'number_of_characters': 338050, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'ben_Beng-tel_Telu': {'num_samples': 1024, 'number_of_characters': 308749, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'ben_Beng-urd_Arab': {'num_samples': 1024, 'number_of_characters': 304838, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'brx_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 323609, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'brx_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 313313, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 
'brx_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 321711, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'brx_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 322789, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'brx_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 315285, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'brx_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 312131, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'brx_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 322797, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'brx_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 334755, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 
'unique_sentence2': 1024}, 'brx_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 325455, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'brx_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 311373, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'brx_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 346327, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'brx_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 324475, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'brx_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 315825, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'brx_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 316110, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 
'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'brx_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 336917, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'brx_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 309554, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'brx_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 320770, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'brx_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 329423, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'brx_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 323112, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'brx_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 351037, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 
'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'brx_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 321736, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'brx_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 317825, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'doi_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 319020, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'doi_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 308724, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'doi_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 321711, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'doi_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 318200, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 
'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'doi_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 310696, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'doi_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 307542, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'doi_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 318208, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'doi_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 330166, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'doi_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 320866, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'doi_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 306784, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 
'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'doi_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 341738, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'doi_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 319886, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'doi_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 311236, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'doi_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 311521, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'doi_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 332328, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'doi_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 304965, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 
'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'doi_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 316181, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'doi_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 324834, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'doi_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 318523, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'doi_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 346448, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'doi_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 317147, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'doi_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 313236, 'unique_pairs': 1024, 'min_sentence1_length': 14, 
'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'eng_Latn-asm_Beng': {'num_samples': 1024, 'number_of_characters': 320098, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'eng_Latn-ben_Beng': {'num_samples': 1024, 'number_of_characters': 309802, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'eng_Latn-brx_Deva': {'num_samples': 1024, 'number_of_characters': 322789, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'eng_Latn-doi_Deva': {'num_samples': 1024, 'number_of_characters': 318200, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'eng_Latn-gom_Deva': {'num_samples': 1024, 'number_of_characters': 311774, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'eng_Latn-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 308620, 'unique_pairs': 1024, 
'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'eng_Latn-hin_Deva': {'num_samples': 1024, 'number_of_characters': 319286, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'eng_Latn-kan_Knda': {'num_samples': 1024, 'number_of_characters': 331244, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'eng_Latn-kas_Arab': {'num_samples': 1024, 'number_of_characters': 321944, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'eng_Latn-mai_Deva': {'num_samples': 1024, 'number_of_characters': 307862, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'eng_Latn-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 342816, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'eng_Latn-mar_Deva': {'num_samples': 1024, 'number_of_characters': 320964, 
'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'eng_Latn-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 312314, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'eng_Latn-npi_Deva': {'num_samples': 1024, 'number_of_characters': 312599, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'eng_Latn-ory_Orya': {'num_samples': 1024, 'number_of_characters': 333406, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'eng_Latn-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306043, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'eng_Latn-san_Deva': {'num_samples': 1024, 'number_of_characters': 317259, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'eng_Latn-sat_Olck': {'num_samples': 1024, 'number_of_characters': 
325912, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'eng_Latn-snd_Deva': {'num_samples': 1024, 'number_of_characters': 319601, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'eng_Latn-tam_Taml': {'num_samples': 1024, 'number_of_characters': 347526, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'eng_Latn-tel_Telu': {'num_samples': 1024, 'number_of_characters': 318225, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'eng_Latn-urd_Arab': {'num_samples': 1024, 'number_of_characters': 314314, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'gom_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 312594, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'gom_Deva-ben_Beng': {'num_samples': 1024, 
'number_of_characters': 302298, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'gom_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 315285, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'gom_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 310696, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'gom_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 311774, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'gom_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 301116, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'gom_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 311782, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'gom_Deva-kan_Knda': 
{'num_samples': 1024, 'number_of_characters': 323740, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'gom_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 314440, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'gom_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 300358, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'gom_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 335312, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'gom_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 313460, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'gom_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 304810, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 
'gom_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 305095, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'gom_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 325902, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'gom_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 298539, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'gom_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 309755, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'gom_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 318408, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'gom_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 312097, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 
'unique_sentence2': 1024}, 'gom_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 340022, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'gom_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 310721, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'gom_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 306810, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'guj_Gujr-asm_Beng': {'num_samples': 1024, 'number_of_characters': 309440, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'guj_Gujr-ben_Beng': {'num_samples': 1024, 'number_of_characters': 299144, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'guj_Gujr-brx_Deva': {'num_samples': 1024, 'number_of_characters': 312131, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 
'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'guj_Gujr-doi_Deva': {'num_samples': 1024, 'number_of_characters': 307542, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'guj_Gujr-eng_Latn': {'num_samples': 1024, 'number_of_characters': 308620, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'guj_Gujr-gom_Deva': {'num_samples': 1024, 'number_of_characters': 301116, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'guj_Gujr-hin_Deva': {'num_samples': 1024, 'number_of_characters': 308628, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'guj_Gujr-kan_Knda': {'num_samples': 1024, 'number_of_characters': 320586, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'guj_Gujr-kas_Arab': {'num_samples': 1024, 'number_of_characters': 311286, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 
'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'guj_Gujr-mai_Deva': {'num_samples': 1024, 'number_of_characters': 297204, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'guj_Gujr-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 332158, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'guj_Gujr-mar_Deva': {'num_samples': 1024, 'number_of_characters': 310306, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'guj_Gujr-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 301656, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'guj_Gujr-npi_Deva': {'num_samples': 1024, 'number_of_characters': 301941, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'guj_Gujr-ory_Orya': {'num_samples': 1024, 'number_of_characters': 322748, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 
'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'guj_Gujr-pan_Guru': {'num_samples': 1024, 'number_of_characters': 295385, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'guj_Gujr-san_Deva': {'num_samples': 1024, 'number_of_characters': 306601, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'guj_Gujr-sat_Olck': {'num_samples': 1024, 'number_of_characters': 315254, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'guj_Gujr-snd_Deva': {'num_samples': 1024, 'number_of_characters': 308943, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'guj_Gujr-tam_Taml': {'num_samples': 1024, 'number_of_characters': 336868, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'guj_Gujr-tel_Telu': {'num_samples': 1024, 'number_of_characters': 307567, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 
'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'guj_Gujr-urd_Arab': {'num_samples': 1024, 'number_of_characters': 303656, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'hin_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 320106, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'hin_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 309810, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'hin_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 322797, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'hin_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 318208, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'hin_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 319286, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 
'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'hin_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 311782, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'hin_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 308628, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'hin_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 331252, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'hin_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 321952, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'hin_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 307870, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'hin_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 342824, 'unique_pairs': 1024, 'min_sentence1_length': 21, 
'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'hin_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 320972, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'hin_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 312322, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'hin_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 312607, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'hin_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 333414, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'hin_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306051, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'hin_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 317267, 'unique_pairs': 1024, 
'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'hin_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 325920, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'hin_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 319609, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'hin_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 347534, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'hin_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 318233, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'hin_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 314322, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'kan_Knda-asm_Beng': {'num_samples': 1024, 'number_of_characters': 332064, 
'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'kan_Knda-ben_Beng': {'num_samples': 1024, 'number_of_characters': 321768, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'kan_Knda-brx_Deva': {'num_samples': 1024, 'number_of_characters': 334755, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'kan_Knda-doi_Deva': {'num_samples': 1024, 'number_of_characters': 330166, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'kan_Knda-eng_Latn': {'num_samples': 1024, 'number_of_characters': 331244, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'kan_Knda-gom_Deva': {'num_samples': 1024, 'number_of_characters': 323740, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'kan_Knda-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 
320586, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'kan_Knda-hin_Deva': {'num_samples': 1024, 'number_of_characters': 331252, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'kan_Knda-kas_Arab': {'num_samples': 1024, 'number_of_characters': 333910, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'kan_Knda-mai_Deva': {'num_samples': 1024, 'number_of_characters': 319828, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'kan_Knda-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 354782, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'kan_Knda-mar_Deva': {'num_samples': 1024, 'number_of_characters': 332930, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'kan_Knda-mni_Mtei': {'num_samples': 1024, 
'number_of_characters': 324280, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'kan_Knda-npi_Deva': {'num_samples': 1024, 'number_of_characters': 324565, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'kan_Knda-ory_Orya': {'num_samples': 1024, 'number_of_characters': 345372, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'kan_Knda-pan_Guru': {'num_samples': 1024, 'number_of_characters': 318009, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'kan_Knda-san_Deva': {'num_samples': 1024, 'number_of_characters': 329225, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'kan_Knda-sat_Olck': {'num_samples': 1024, 'number_of_characters': 337878, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'kan_Knda-snd_Deva': 
{'num_samples': 1024, 'number_of_characters': 331567, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'kan_Knda-tam_Taml': {'num_samples': 1024, 'number_of_characters': 359492, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'kan_Knda-tel_Telu': {'num_samples': 1024, 'number_of_characters': 330191, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'kan_Knda-urd_Arab': {'num_samples': 1024, 'number_of_characters': 326280, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'kas_Arab-asm_Beng': {'num_samples': 1024, 'number_of_characters': 322764, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'kas_Arab-ben_Beng': {'num_samples': 1024, 'number_of_characters': 312468, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 
'kas_Arab-brx_Deva': {'num_samples': 1024, 'number_of_characters': 325455, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'kas_Arab-doi_Deva': {'num_samples': 1024, 'number_of_characters': 320866, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'kas_Arab-eng_Latn': {'num_samples': 1024, 'number_of_characters': 321944, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'kas_Arab-gom_Deva': {'num_samples': 1024, 'number_of_characters': 314440, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'kas_Arab-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 311286, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'kas_Arab-hin_Deva': {'num_samples': 1024, 'number_of_characters': 321952, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 
'unique_sentence2': 1024}, 'kas_Arab-kan_Knda': {'num_samples': 1024, 'number_of_characters': 333910, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'kas_Arab-mai_Deva': {'num_samples': 1024, 'number_of_characters': 310528, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'kas_Arab-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 345482, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'kas_Arab-mar_Deva': {'num_samples': 1024, 'number_of_characters': 323630, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'kas_Arab-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 314980, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'kas_Arab-npi_Deva': {'num_samples': 1024, 'number_of_characters': 315265, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 
'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'kas_Arab-ory_Orya': {'num_samples': 1024, 'number_of_characters': 336072, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'kas_Arab-pan_Guru': {'num_samples': 1024, 'number_of_characters': 308709, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'kas_Arab-san_Deva': {'num_samples': 1024, 'number_of_characters': 319925, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'kas_Arab-sat_Olck': {'num_samples': 1024, 'number_of_characters': 328578, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'kas_Arab-snd_Deva': {'num_samples': 1024, 'number_of_characters': 322267, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'kas_Arab-tam_Taml': {'num_samples': 1024, 'number_of_characters': 350192, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 
'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'kas_Arab-tel_Telu': {'num_samples': 1024, 'number_of_characters': 320891, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'kas_Arab-urd_Arab': {'num_samples': 1024, 'number_of_characters': 316980, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mai_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 308682, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mai_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 298386, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mai_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 311373, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mai_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 306784, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 
'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mai_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 307862, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mai_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 300358, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mai_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 297204, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mai_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 307870, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mai_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 319828, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mai_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 310528, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 
'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mai_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 331400, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'mai_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 309548, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'mai_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 300898, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'mai_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 301183, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mai_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 321990, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mai_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 294627, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 
'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mai_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 305843, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mai_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 314496, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mai_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 308185, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mai_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 336110, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mai_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 306809, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mai_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 302898, 'unique_pairs': 1024, 'min_sentence1_length': 14, 
'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mal_Mlym-asm_Beng': {'num_samples': 1024, 'number_of_characters': 343636, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mal_Mlym-ben_Beng': {'num_samples': 1024, 'number_of_characters': 333340, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mal_Mlym-brx_Deva': {'num_samples': 1024, 'number_of_characters': 346327, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mal_Mlym-doi_Deva': {'num_samples': 1024, 'number_of_characters': 341738, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mal_Mlym-eng_Latn': {'num_samples': 1024, 'number_of_characters': 342816, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mal_Mlym-gom_Deva': {'num_samples': 1024, 'number_of_characters': 335312, 'unique_pairs': 1024, 
'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mal_Mlym-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 332158, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mal_Mlym-hin_Deva': {'num_samples': 1024, 'number_of_characters': 342824, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mal_Mlym-kan_Knda': {'num_samples': 1024, 'number_of_characters': 354782, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mal_Mlym-kas_Arab': {'num_samples': 1024, 'number_of_characters': 345482, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mal_Mlym-mai_Deva': {'num_samples': 1024, 'number_of_characters': 331400, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'mal_Mlym-mar_Deva': {'num_samples': 1024, 'number_of_characters': 344502, 
'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'mal_Mlym-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 335852, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'mal_Mlym-npi_Deva': {'num_samples': 1024, 'number_of_characters': 336137, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mal_Mlym-ory_Orya': {'num_samples': 1024, 'number_of_characters': 356944, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mal_Mlym-pan_Guru': {'num_samples': 1024, 'number_of_characters': 329581, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mal_Mlym-san_Deva': {'num_samples': 1024, 'number_of_characters': 340797, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mal_Mlym-sat_Olck': {'num_samples': 1024, 
'number_of_characters': 349450, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mal_Mlym-snd_Deva': {'num_samples': 1024, 'number_of_characters': 343139, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mal_Mlym-tam_Taml': {'num_samples': 1024, 'number_of_characters': 371064, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mal_Mlym-tel_Telu': {'num_samples': 1024, 'number_of_characters': 341763, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mal_Mlym-urd_Arab': {'num_samples': 1024, 'number_of_characters': 337852, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mar_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 321784, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mar_Deva-ben_Beng': 
{'num_samples': 1024, 'number_of_characters': 311488, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mar_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 324475, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mar_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 319886, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mar_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 320964, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mar_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 313460, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mar_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 310306, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 
'mar_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 320972, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mar_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 332930, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mar_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 323630, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mar_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 309548, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'mar_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 344502, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'mar_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 314000, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 
'unique_sentence2': 1024}, 'mar_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 314285, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mar_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 335092, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mar_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 307729, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mar_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 318945, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mar_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 327598, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mar_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 321287, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 
'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mar_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 349212, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mar_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 319911, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mar_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 316000, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mni_Mtei-asm_Beng': {'num_samples': 1024, 'number_of_characters': 313134, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mni_Mtei-ben_Beng': {'num_samples': 1024, 'number_of_characters': 302838, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mni_Mtei-brx_Deva': {'num_samples': 1024, 'number_of_characters': 315825, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 
'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mni_Mtei-doi_Deva': {'num_samples': 1024, 'number_of_characters': 311236, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mni_Mtei-eng_Latn': {'num_samples': 1024, 'number_of_characters': 312314, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mni_Mtei-gom_Deva': {'num_samples': 1024, 'number_of_characters': 304810, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mni_Mtei-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 301656, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mni_Mtei-hin_Deva': {'num_samples': 1024, 'number_of_characters': 312322, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mni_Mtei-kan_Knda': {'num_samples': 1024, 'number_of_characters': 324280, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 
'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mni_Mtei-kas_Arab': {'num_samples': 1024, 'number_of_characters': 314980, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mni_Mtei-mai_Deva': {'num_samples': 1024, 'number_of_characters': 300898, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'mni_Mtei-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 335852, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'mni_Mtei-mar_Deva': {'num_samples': 1024, 'number_of_characters': 314000, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'mni_Mtei-npi_Deva': {'num_samples': 1024, 'number_of_characters': 305635, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mni_Mtei-ory_Orya': {'num_samples': 1024, 'number_of_characters': 326442, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 
'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mni_Mtei-pan_Guru': {'num_samples': 1024, 'number_of_characters': 299079, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mni_Mtei-san_Deva': {'num_samples': 1024, 'number_of_characters': 310295, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mni_Mtei-sat_Olck': {'num_samples': 1024, 'number_of_characters': 318948, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mni_Mtei-snd_Deva': {'num_samples': 1024, 'number_of_characters': 312637, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mni_Mtei-tam_Taml': {'num_samples': 1024, 'number_of_characters': 340562, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mni_Mtei-tel_Telu': {'num_samples': 1024, 'number_of_characters': 311261, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 
'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mni_Mtei-urd_Arab': {'num_samples': 1024, 'number_of_characters': 307350, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'npi_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 313419, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'npi_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 303123, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'npi_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 316110, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'npi_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 311521, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'npi_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 312599, 'unique_pairs': 1024, 'min_sentence1_length': 10, 
'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'npi_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 305095, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'npi_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 301941, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'npi_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 312607, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'npi_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 324565, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'npi_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 315265, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'npi_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 301183, 'unique_pairs': 1024, 
'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'npi_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 336137, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'npi_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 314285, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'npi_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 305635, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'npi_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 326727, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'npi_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 299364, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'npi_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 310580, 
'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'npi_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 319233, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'npi_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 312922, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'npi_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 340847, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'npi_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 311546, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'npi_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 307635, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'ory_Orya-asm_Beng': {'num_samples': 1024, 
'number_of_characters': 334226, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'ory_Orya-ben_Beng': {'num_samples': 1024, 'number_of_characters': 323930, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'ory_Orya-brx_Deva': {'num_samples': 1024, 'number_of_characters': 336917, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'ory_Orya-doi_Deva': {'num_samples': 1024, 'number_of_characters': 332328, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'ory_Orya-eng_Latn': {'num_samples': 1024, 'number_of_characters': 333406, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'ory_Orya-gom_Deva': {'num_samples': 1024, 'number_of_characters': 325902, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'ory_Orya-guj_Gujr': 
{'num_samples': 1024, 'number_of_characters': 322748, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'ory_Orya-hin_Deva': {'num_samples': 1024, 'number_of_characters': 333414, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'ory_Orya-kan_Knda': {'num_samples': 1024, 'number_of_characters': 345372, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'ory_Orya-kas_Arab': {'num_samples': 1024, 'number_of_characters': 336072, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'ory_Orya-mai_Deva': {'num_samples': 1024, 'number_of_characters': 321990, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'ory_Orya-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 356944, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 
'ory_Orya-mar_Deva': {'num_samples': 1024, 'number_of_characters': 335092, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'ory_Orya-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 326442, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'ory_Orya-npi_Deva': {'num_samples': 1024, 'number_of_characters': 326727, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'ory_Orya-pan_Guru': {'num_samples': 1024, 'number_of_characters': 320171, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'ory_Orya-san_Deva': {'num_samples': 1024, 'number_of_characters': 331387, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'ory_Orya-sat_Olck': {'num_samples': 1024, 'number_of_characters': 340040, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 
'unique_sentence2': 1024}, 'ory_Orya-snd_Deva': {'num_samples': 1024, 'number_of_characters': 333729, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'ory_Orya-tam_Taml': {'num_samples': 1024, 'number_of_characters': 361654, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'ory_Orya-tel_Telu': {'num_samples': 1024, 'number_of_characters': 332353, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'ory_Orya-urd_Arab': {'num_samples': 1024, 'number_of_characters': 328442, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'pan_Guru-asm_Beng': {'num_samples': 1024, 'number_of_characters': 306863, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'pan_Guru-ben_Beng': {'num_samples': 1024, 'number_of_characters': 296567, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 
'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'pan_Guru-brx_Deva': {'num_samples': 1024, 'number_of_characters': 309554, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'pan_Guru-doi_Deva': {'num_samples': 1024, 'number_of_characters': 304965, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'pan_Guru-eng_Latn': {'num_samples': 1024, 'number_of_characters': 306043, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'pan_Guru-gom_Deva': {'num_samples': 1024, 'number_of_characters': 298539, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'pan_Guru-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 295385, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'pan_Guru-hin_Deva': {'num_samples': 1024, 'number_of_characters': 306051, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 
'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'pan_Guru-kan_Knda': {'num_samples': 1024, 'number_of_characters': 318009, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'pan_Guru-kas_Arab': {'num_samples': 1024, 'number_of_characters': 308709, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'pan_Guru-mai_Deva': {'num_samples': 1024, 'number_of_characters': 294627, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'pan_Guru-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 329581, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'pan_Guru-mar_Deva': {'num_samples': 1024, 'number_of_characters': 307729, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'pan_Guru-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 299079, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 
'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'pan_Guru-npi_Deva': {'num_samples': 1024, 'number_of_characters': 299364, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'pan_Guru-ory_Orya': {'num_samples': 1024, 'number_of_characters': 320171, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'pan_Guru-san_Deva': {'num_samples': 1024, 'number_of_characters': 304024, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'pan_Guru-sat_Olck': {'num_samples': 1024, 'number_of_characters': 312677, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'pan_Guru-snd_Deva': {'num_samples': 1024, 'number_of_characters': 306366, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'pan_Guru-tam_Taml': {'num_samples': 1024, 'number_of_characters': 334291, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 
'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'pan_Guru-tel_Telu': {'num_samples': 1024, 'number_of_characters': 304990, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'pan_Guru-urd_Arab': {'num_samples': 1024, 'number_of_characters': 301079, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'san_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 318079, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'san_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 307783, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'san_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 320770, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'san_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 316181, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 
'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'san_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 317259, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'san_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 309755, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'san_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 306601, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'san_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 317267, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'san_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 329225, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'san_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 319925, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 
153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'san_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 305843, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'san_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 340797, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'san_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 318945, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'san_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 310295, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'san_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 310580, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'san_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 331387, 'unique_pairs': 1024, 'min_sentence1_length': 9, 
'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'san_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 304024, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'san_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 323893, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'san_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 317582, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'san_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 345507, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'san_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 316206, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'san_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 312295, 'unique_pairs': 1024, 
'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'sat_Olck-asm_Beng': {'num_samples': 1024, 'number_of_characters': 326732, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'sat_Olck-ben_Beng': {'num_samples': 1024, 'number_of_characters': 316436, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'sat_Olck-brx_Deva': {'num_samples': 1024, 'number_of_characters': 329423, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'sat_Olck-doi_Deva': {'num_samples': 1024, 'number_of_characters': 324834, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'sat_Olck-eng_Latn': {'num_samples': 1024, 'number_of_characters': 325912, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'sat_Olck-gom_Deva': {'num_samples': 1024, 'number_of_characters': 318408, 
'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'sat_Olck-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 315254, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'sat_Olck-hin_Deva': {'num_samples': 1024, 'number_of_characters': 325920, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'sat_Olck-kan_Knda': {'num_samples': 1024, 'number_of_characters': 337878, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'sat_Olck-kas_Arab': {'num_samples': 1024, 'number_of_characters': 328578, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'sat_Olck-mai_Deva': {'num_samples': 1024, 'number_of_characters': 314496, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'sat_Olck-mal_Mlym': {'num_samples': 1024, 
'number_of_characters': 349450, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'sat_Olck-mar_Deva': {'num_samples': 1024, 'number_of_characters': 327598, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'sat_Olck-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 318948, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'sat_Olck-npi_Deva': {'num_samples': 1024, 'number_of_characters': 319233, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'sat_Olck-ory_Orya': {'num_samples': 1024, 'number_of_characters': 340040, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'sat_Olck-pan_Guru': {'num_samples': 1024, 'number_of_characters': 312677, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'sat_Olck-san_Deva': 
{'num_samples': 1024, 'number_of_characters': 323893, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'sat_Olck-snd_Deva': {'num_samples': 1024, 'number_of_characters': 326235, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'sat_Olck-tam_Taml': {'num_samples': 1024, 'number_of_characters': 354160, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'sat_Olck-tel_Telu': {'num_samples': 1024, 'number_of_characters': 324859, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'sat_Olck-urd_Arab': {'num_samples': 1024, 'number_of_characters': 320948, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'snd_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 320421, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 
'snd_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 310125, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'snd_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 323112, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'snd_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 318523, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'snd_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 319601, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'snd_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 312097, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'snd_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 308943, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 
'unique_sentence2': 1024}, 'snd_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 319609, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'snd_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 331567, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'snd_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 322267, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'snd_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 308185, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'snd_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 343139, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'snd_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 321287, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 
'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'snd_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 312637, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'snd_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 312922, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'snd_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 333729, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'snd_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306366, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'snd_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 317582, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'snd_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 326235, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 
'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'snd_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 347849, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'snd_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 318548, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'snd_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 314637, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'tam_Taml-asm_Beng': {'num_samples': 1024, 'number_of_characters': 348346, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'tam_Taml-ben_Beng': {'num_samples': 1024, 'number_of_characters': 338050, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'tam_Taml-brx_Deva': {'num_samples': 1024, 'number_of_characters': 351037, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 
'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'tam_Taml-doi_Deva': {'num_samples': 1024, 'number_of_characters': 346448, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'tam_Taml-eng_Latn': {'num_samples': 1024, 'number_of_characters': 347526, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'tam_Taml-gom_Deva': {'num_samples': 1024, 'number_of_characters': 340022, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'tam_Taml-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 336868, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'tam_Taml-hin_Deva': {'num_samples': 1024, 'number_of_characters': 347534, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'tam_Taml-kan_Knda': {'num_samples': 1024, 'number_of_characters': 359492, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 
'unique_sentence1': 1023, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'tam_Taml-kas_Arab': {'num_samples': 1024, 'number_of_characters': 350192, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'tam_Taml-mai_Deva': {'num_samples': 1024, 'number_of_characters': 336110, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'tam_Taml-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 371064, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'tam_Taml-mar_Deva': {'num_samples': 1024, 'number_of_characters': 349212, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'tam_Taml-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 340562, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'tam_Taml-npi_Deva': {'num_samples': 1024, 'number_of_characters': 340847, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 
'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'tam_Taml-ory_Orya': {'num_samples': 1024, 'number_of_characters': 361654, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'tam_Taml-pan_Guru': {'num_samples': 1024, 'number_of_characters': 334291, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'tam_Taml-san_Deva': {'num_samples': 1024, 'number_of_characters': 345507, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'tam_Taml-sat_Olck': {'num_samples': 1024, 'number_of_characters': 354160, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'tam_Taml-snd_Deva': {'num_samples': 1024, 'number_of_characters': 347849, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'tam_Taml-tel_Telu': {'num_samples': 1024, 'number_of_characters': 346473, 'unique_pairs': 1024, 'min_sentence1_length': 32, 
'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'tam_Taml-urd_Arab': {'num_samples': 1024, 'number_of_characters': 342562, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'tel_Telu-asm_Beng': {'num_samples': 1024, 'number_of_characters': 319045, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'tel_Telu-ben_Beng': {'num_samples': 1024, 'number_of_characters': 308749, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'tel_Telu-brx_Deva': {'num_samples': 1024, 'number_of_characters': 321736, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'tel_Telu-doi_Deva': {'num_samples': 1024, 'number_of_characters': 317147, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'tel_Telu-eng_Latn': {'num_samples': 1024, 'number_of_characters': 318225, 'unique_pairs': 1024, 
'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'tel_Telu-gom_Deva': {'num_samples': 1024, 'number_of_characters': 310721, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'tel_Telu-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 307567, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'tel_Telu-hin_Deva': {'num_samples': 1024, 'number_of_characters': 318233, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'tel_Telu-kan_Knda': {'num_samples': 1024, 'number_of_characters': 330191, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'tel_Telu-kas_Arab': {'num_samples': 1024, 'number_of_characters': 320891, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'tel_Telu-mai_Deva': {'num_samples': 1024, 'number_of_characters': 306809, 
'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'tel_Telu-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 341763, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'tel_Telu-mar_Deva': {'num_samples': 1024, 'number_of_characters': 319911, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'tel_Telu-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 311261, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'tel_Telu-npi_Deva': {'num_samples': 1024, 'number_of_characters': 311546, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'tel_Telu-ory_Orya': {'num_samples': 1024, 'number_of_characters': 332353, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'tel_Telu-pan_Guru': {'num_samples': 1024, 
'number_of_characters': 304990, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'tel_Telu-san_Deva': {'num_samples': 1024, 'number_of_characters': 316206, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'tel_Telu-sat_Olck': {'num_samples': 1024, 'number_of_characters': 324859, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'tel_Telu-snd_Deva': {'num_samples': 1024, 'number_of_characters': 318548, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'tel_Telu-tam_Taml': {'num_samples': 1024, 'number_of_characters': 346473, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'tel_Telu-urd_Arab': {'num_samples': 1024, 'number_of_characters': 313261, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'urd_Arab-asm_Beng': 
{'num_samples': 1024, 'number_of_characters': 315134, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'urd_Arab-ben_Beng': {'num_samples': 1024, 'number_of_characters': 304838, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'urd_Arab-brx_Deva': {'num_samples': 1024, 'number_of_characters': 317825, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'urd_Arab-doi_Deva': {'num_samples': 1024, 'number_of_characters': 313236, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'urd_Arab-eng_Latn': {'num_samples': 1024, 'number_of_characters': 314314, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'urd_Arab-gom_Deva': {'num_samples': 1024, 'number_of_characters': 306810, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 
'urd_Arab-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 303656, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'urd_Arab-hin_Deva': {'num_samples': 1024, 'number_of_characters': 314322, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'urd_Arab-kan_Knda': {'num_samples': 1024, 'number_of_characters': 326280, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'urd_Arab-kas_Arab': {'num_samples': 1024, 'number_of_characters': 316980, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'urd_Arab-mai_Deva': {'num_samples': 1024, 'number_of_characters': 302898, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'urd_Arab-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 337852, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 
'unique_sentence2': 1024}, 'urd_Arab-mar_Deva': {'num_samples': 1024, 'number_of_characters': 316000, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'urd_Arab-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 307350, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'urd_Arab-npi_Deva': {'num_samples': 1024, 'number_of_characters': 307635, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'urd_Arab-ory_Orya': {'num_samples': 1024, 'number_of_characters': 328442, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'urd_Arab-pan_Guru': {'num_samples': 1024, 'number_of_characters': 301079, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'urd_Arab-san_Deva': {'num_samples': 1024, 'number_of_characters': 312295, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 
'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'urd_Arab-sat_Olck': {'num_samples': 1024, 'number_of_characters': 320948, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'urd_Arab-snd_Deva': {'num_samples': 1024, 'number_of_characters': 314637, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'urd_Arab-tam_Taml': {'num_samples': 1024, 'number_of_characters': 342562, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'urd_Arab-tel_Telu': {'num_samples': 1024, 'number_of_characters': 313261, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}}}} | -| [IWSLT2017BitextMining](https://aclanthology.org/2017.iwslt-1.1/) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'ita', 'jpn', 'kor', 'nld', 'ron'] | BitextMining | s2s | [Non-fiction, Fiction, Written] | {'validation': 21938} | {'validation': {'num_samples': 21938, 'number_of_characters': 4256244, 'unique_pairs': 21840, 'min_sentence1_length': 2, 'average_sentence1_length': 97.01, 'max_sentence1_length': 521, 'unique_sentence1': 11563, 'min_sentence2_length': 2, 'average_sentence2_length': 97.01, 'max_sentence2_length': 521, 'unique_sentence2': 11563, 'hf_subset_descriptive_stats': {'ar-en': 
{'num_samples': 888, 'number_of_characters': 172499, 'unique_pairs': 887, 'min_sentence1_length': 4, 'average_sentence1_length': 85.49, 'max_sentence1_length': 369, 'unique_sentence1': 887, 'min_sentence2_length': 10, 'average_sentence2_length': 108.77, 'max_sentence2_length': 462, 'unique_sentence2': 881}, 'de-en': {'num_samples': 888, 'number_of_characters': 202336, 'unique_pairs': 883, 'min_sentence1_length': 6, 'average_sentence1_length': 119.03, 'max_sentence1_length': 521, 'unique_sentence1': 881, 'min_sentence2_length': 10, 'average_sentence2_length': 108.83, 'max_sentence2_length': 462, 'unique_sentence2': 881}, 'en-ar': {'num_samples': 888, 'number_of_characters': 172499, 'unique_pairs': 887, 'min_sentence1_length': 10, 'average_sentence1_length': 108.77, 'max_sentence1_length': 462, 'unique_sentence1': 881, 'min_sentence2_length': 4, 'average_sentence2_length': 85.49, 'max_sentence2_length': 369, 'unique_sentence2': 887}, 'en-de': {'num_samples': 888, 'number_of_characters': 202336, 'unique_pairs': 883, 'min_sentence1_length': 10, 'average_sentence1_length': 108.83, 'max_sentence1_length': 462, 'unique_sentence1': 881, 'min_sentence2_length': 6, 'average_sentence2_length': 119.03, 'max_sentence2_length': 521, 'unique_sentence2': 881}, 'en-fr': {'num_samples': 890, 'number_of_characters': 197619, 'unique_pairs': 883, 'min_sentence1_length': 10, 'average_sentence1_length': 108.41, 'max_sentence1_length': 462, 'unique_sentence1': 883, 'min_sentence2_length': 6, 'average_sentence2_length': 113.63, 'max_sentence2_length': 493, 'unique_sentence2': 881}, 'en-it': {'num_samples': 929, 'number_of_characters': 191803, 'unique_pairs': 924, 'min_sentence1_length': 10, 'average_sentence1_length': 103.0, 'max_sentence1_length': 433, 'unique_sentence1': 922, 'min_sentence2_length': 7, 'average_sentence2_length': 103.46, 'max_sentence2_length': 444, 'unique_sentence2': 918}, 'en-ja': {'num_samples': 871, 'number_of_characters': 132742, 'unique_pairs': 867, 
'min_sentence1_length': 10, 'average_sentence1_length': 109.81, 'max_sentence1_length': 462, 'unique_sentence1': 864, 'min_sentence2_length': 5, 'average_sentence2_length': 42.59, 'max_sentence2_length': 225, 'unique_sentence2': 866}, 'en-ko': {'num_samples': 879, 'number_of_characters': 142659, 'unique_pairs': 874, 'min_sentence1_length': 10, 'average_sentence1_length': 107.74, 'max_sentence1_length': 462, 'unique_sentence1': 872, 'min_sentence2_length': 3, 'average_sentence2_length': 54.56, 'max_sentence2_length': 250, 'unique_sentence2': 872}, 'en-nl': {'num_samples': 1003, 'number_of_characters': 189637, 'unique_pairs': 1000, 'min_sentence1_length': 10, 'average_sentence1_length': 95.27, 'max_sentence1_length': 433, 'unique_sentence1': 996, 'min_sentence2_length': 4, 'average_sentence2_length': 93.8, 'max_sentence2_length': 477, 'unique_sentence2': 1000}, 'en-ro': {'num_samples': 914, 'number_of_characters': 194128, 'unique_pairs': 910, 'min_sentence1_length': 10, 'average_sentence1_length': 104.72, 'max_sentence1_length': 433, 'unique_sentence1': 907, 'min_sentence2_length': 9, 'average_sentence2_length': 107.67, 'max_sentence2_length': 448, 'unique_sentence2': 910}, 'en-zh': {'num_samples': 879, 'number_of_characters': 131126, 'unique_pairs': 877, 'min_sentence1_length': 10, 'average_sentence1_length': 109.37, 'max_sentence1_length': 462, 'unique_sentence1': 872, 'min_sentence2_length': 2, 'average_sentence2_length': 39.81, 'max_sentence2_length': 230, 'unique_sentence2': 867}, 'fr-en': {'num_samples': 890, 'number_of_characters': 197619, 'unique_pairs': 883, 'min_sentence1_length': 6, 'average_sentence1_length': 113.63, 'max_sentence1_length': 493, 'unique_sentence1': 881, 'min_sentence2_length': 10, 'average_sentence2_length': 108.41, 'max_sentence2_length': 462, 'unique_sentence2': 883}, 'it-en': {'num_samples': 929, 'number_of_characters': 191803, 'unique_pairs': 924, 'min_sentence1_length': 7, 'average_sentence1_length': 103.46, 'max_sentence1_length': 
444, 'unique_sentence1': 918, 'min_sentence2_length': 10, 'average_sentence2_length': 103.0, 'max_sentence2_length': 433, 'unique_sentence2': 922}, 'it-nl': {'num_samples': 1001, 'number_of_characters': 188858, 'unique_pairs': 998, 'min_sentence1_length': 7, 'average_sentence1_length': 94.64, 'max_sentence1_length': 459, 'unique_sentence1': 994, 'min_sentence2_length': 7, 'average_sentence2_length': 94.03, 'max_sentence2_length': 505, 'unique_sentence2': 998}, 'it-ro': {'num_samples': 914, 'number_of_characters': 193339, 'unique_pairs': 911, 'min_sentence1_length': 7, 'average_sentence1_length': 103.91, 'max_sentence1_length': 435, 'unique_sentence1': 907, 'min_sentence2_length': 9, 'average_sentence2_length': 107.62, 'max_sentence2_length': 448, 'unique_sentence2': 910}, 'ja-en': {'num_samples': 871, 'number_of_characters': 132742, 'unique_pairs': 867, 'min_sentence1_length': 5, 'average_sentence1_length': 42.59, 'max_sentence1_length': 225, 'unique_sentence1': 866, 'min_sentence2_length': 10, 'average_sentence2_length': 109.81, 'max_sentence2_length': 462, 'unique_sentence2': 864}, 'ko-en': {'num_samples': 879, 'number_of_characters': 142659, 'unique_pairs': 874, 'min_sentence1_length': 3, 'average_sentence1_length': 54.56, 'max_sentence1_length': 250, 'unique_sentence1': 872, 'min_sentence2_length': 10, 'average_sentence2_length': 107.74, 'max_sentence2_length': 462, 'unique_sentence2': 872}, 'nl-en': {'num_samples': 1003, 'number_of_characters': 189637, 'unique_pairs': 1000, 'min_sentence1_length': 4, 'average_sentence1_length': 93.8, 'max_sentence1_length': 477, 'unique_sentence1': 1000, 'min_sentence2_length': 10, 'average_sentence2_length': 95.27, 'max_sentence2_length': 433, 'unique_sentence2': 996}, 'nl-it': {'num_samples': 1001, 'number_of_characters': 188858, 'unique_pairs': 998, 'min_sentence1_length': 7, 'average_sentence1_length': 94.03, 'max_sentence1_length': 505, 'unique_sentence1': 998, 'min_sentence2_length': 7, 'average_sentence2_length': 94.64, 
'max_sentence2_length': 459, 'unique_sentence2': 994}, 'nl-ro': {'num_samples': 913, 'number_of_characters': 191376, 'unique_pairs': 911, 'min_sentence1_length': 7, 'average_sentence1_length': 102.02, 'max_sentence1_length': 478, 'unique_sentence1': 909, 'min_sentence2_length': 9, 'average_sentence2_length': 107.59, 'max_sentence2_length': 515, 'unique_sentence2': 909}, 'ro-en': {'num_samples': 914, 'number_of_characters': 194128, 'unique_pairs': 910, 'min_sentence1_length': 9, 'average_sentence1_length': 107.67, 'max_sentence1_length': 448, 'unique_sentence1': 910, 'min_sentence2_length': 10, 'average_sentence2_length': 104.72, 'max_sentence2_length': 433, 'unique_sentence2': 907}, 'ro-it': {'num_samples': 914, 'number_of_characters': 193339, 'unique_pairs': 911, 'min_sentence1_length': 9, 'average_sentence1_length': 107.62, 'max_sentence1_length': 448, 'unique_sentence1': 910, 'min_sentence2_length': 7, 'average_sentence2_length': 103.91, 'max_sentence2_length': 435, 'unique_sentence2': 907}, 'ro-nl': {'num_samples': 913, 'number_of_characters': 191376, 'unique_pairs': 911, 'min_sentence1_length': 9, 'average_sentence1_length': 107.59, 'max_sentence1_length': 515, 'unique_sentence1': 909, 'min_sentence2_length': 7, 'average_sentence2_length': 102.02, 'max_sentence2_length': 478, 'unique_sentence2': 909}, 'zh-en': {'num_samples': 879, 'number_of_characters': 131126, 'unique_pairs': 877, 'min_sentence1_length': 2, 'average_sentence1_length': 39.81, 'max_sentence1_length': 230, 'unique_sentence1': 867, 'min_sentence2_length': 10, 'average_sentence2_length': 109.37, 'max_sentence2_length': 462, 'unique_sentence2': 872}}}} | +| [IN22ConvBitextMining](https://huggingface.co/datasets/ai4bharat/IN22-Conv) (Jay Gala, 2023) | ['asm', 'ben', 'brx', 'doi', 'eng', 'gom', 'guj', 'hin', 'kan', 'kas', 'mai', 'mal', 'mar', 'mni', 'npi', 'ory', 'pan', 'san', 'sat', 'snd', 'tam', 'tel', 'urd'] | BitextMining | s2s | [Fiction, Social, Spoken, Spoken] | {'test': 760518} | {'test': 
{'num_samples': 760518, 'number_of_characters': 82637104, 'unique_pairs': 759283, 'min_sentence1_length': 3, 'average_sentence1_length': 54.33, 'max_sentence1_length': 239, 'unique_sentence1': 34430, 'min_sentence2_length': 3, 'average_sentence2_length': 54.33, 'max_sentence2_length': 239, 'unique_sentence2': 34430, 'hf_subset_descriptive_stats': {'asm_Beng-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155988, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'asm_Beng-brx_Deva': {'num_samples': 1503, 'number_of_characters': 162044, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'asm_Beng-doi_Deva': {'num_samples': 1503, 'number_of_characters': 167032, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'asm_Beng-eng_Latn': {'num_samples': 1503, 'number_of_characters': 160716, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'asm_Beng-gom_Deva': {'num_samples': 1503, 'number_of_characters': 156282, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 
'unique_sentence2': 1500}, 'asm_Beng-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 158269, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'asm_Beng-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159964, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'asm_Beng-kan_Knda': {'num_samples': 1503, 'number_of_characters': 165177, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'asm_Beng-kas_Arab': {'num_samples': 1503, 'number_of_characters': 164681, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'asm_Beng-mai_Deva': {'num_samples': 1503, 'number_of_characters': 162408, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'asm_Beng-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 172838, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 
'unique_sentence2': 1495}, 'asm_Beng-mar_Deva': {'num_samples': 1503, 'number_of_characters': 162747, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'asm_Beng-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 157316, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'asm_Beng-npi_Deva': {'num_samples': 1503, 'number_of_characters': 160906, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'asm_Beng-ory_Orya': {'num_samples': 1503, 'number_of_characters': 164223, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'asm_Beng-pan_Guru': {'num_samples': 1503, 'number_of_characters': 160201, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'asm_Beng-san_Deva': {'num_samples': 1503, 'number_of_characters': 158093, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 
'unique_sentence2': 1500}, 'asm_Beng-sat_Olck': {'num_samples': 1503, 'number_of_characters': 169379, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'asm_Beng-snd_Deva': {'num_samples': 1503, 'number_of_characters': 162623, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'asm_Beng-tam_Taml': {'num_samples': 1503, 'number_of_characters': 174866, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'asm_Beng-tel_Telu': {'num_samples': 1503, 'number_of_characters': 157690, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'asm_Beng-urd_Arab': {'num_samples': 1503, 'number_of_characters': 161305, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 53.75, 'max_sentence1_length': 208, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'ben_Beng-asm_Beng': {'num_samples': 1503, 'number_of_characters': 155988, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 
'unique_sentence2': 1497}, 'ben_Beng-brx_Deva': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'ben_Beng-doi_Deva': {'num_samples': 1503, 'number_of_characters': 161436, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'ben_Beng-eng_Latn': {'num_samples': 1503, 'number_of_characters': 155120, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'ben_Beng-gom_Deva': {'num_samples': 1503, 'number_of_characters': 150686, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'ben_Beng-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 152673, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'ben_Beng-hin_Deva': {'num_samples': 1503, 'number_of_characters': 154368, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 
'unique_sentence2': 1497}, 'ben_Beng-kan_Knda': {'num_samples': 1503, 'number_of_characters': 159581, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'ben_Beng-kas_Arab': {'num_samples': 1503, 'number_of_characters': 159085, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'ben_Beng-mai_Deva': {'num_samples': 1503, 'number_of_characters': 156812, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'ben_Beng-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 167242, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'ben_Beng-mar_Deva': {'num_samples': 1503, 'number_of_characters': 157151, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'ben_Beng-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 151720, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 
'unique_sentence2': 1498}, 'ben_Beng-npi_Deva': {'num_samples': 1503, 'number_of_characters': 155310, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'ben_Beng-ory_Orya': {'num_samples': 1503, 'number_of_characters': 158627, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'ben_Beng-pan_Guru': {'num_samples': 1503, 'number_of_characters': 154605, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'ben_Beng-san_Deva': {'num_samples': 1503, 'number_of_characters': 152497, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'ben_Beng-sat_Olck': {'num_samples': 1503, 'number_of_characters': 163783, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'ben_Beng-snd_Deva': {'num_samples': 1503, 'number_of_characters': 157027, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 
'unique_sentence2': 1490}, 'ben_Beng-tam_Taml': {'num_samples': 1503, 'number_of_characters': 169270, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'ben_Beng-tel_Telu': {'num_samples': 1503, 'number_of_characters': 152094, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'ben_Beng-urd_Arab': {'num_samples': 1503, 'number_of_characters': 155709, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.03, 'max_sentence1_length': 178, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'brx_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162044, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'brx_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'brx_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 167492, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 
'unique_sentence2': 1499}, 'brx_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161176, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'brx_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 156742, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'brx_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'brx_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 160424, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'brx_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 165637, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'brx_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165141, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 
'unique_sentence2': 1502}, 'brx_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 162868, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'brx_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 173298, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'brx_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 163207, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'brx_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 157776, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'brx_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'brx_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 164683, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 
'unique_sentence2': 1500}, 'brx_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 160661, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'brx_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 158553, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'brx_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 169839, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'brx_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 163083, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'brx_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 175326, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'brx_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158150, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 
'unique_sentence2': 1495}, 'brx_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 161765, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.06, 'max_sentence1_length': 210, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'doi_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 167032, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'doi_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 161436, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'doi_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 167492, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'doi_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 166164, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'doi_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 
'unique_sentence2': 1500}, 'doi_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 163717, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'doi_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 165412, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'doi_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 170625, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'doi_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 170129, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'doi_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 167856, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'doi_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 178286, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 
'unique_sentence2': 1495}, 'doi_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 168195, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'doi_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 162764, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'doi_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 166354, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'doi_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 169671, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'doi_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 165649, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'doi_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 163541, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 
'unique_sentence2': 1500}, 'doi_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 174827, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'doi_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 168071, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'doi_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 180314, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'doi_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 163138, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'doi_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 166753, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 57.38, 'max_sentence1_length': 209, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'eng_Latn-asm_Beng': {'num_samples': 1503, 'number_of_characters': 160716, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 
'unique_sentence2': 1497}, 'eng_Latn-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155120, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'eng_Latn-brx_Deva': {'num_samples': 1503, 'number_of_characters': 161176, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'eng_Latn-doi_Deva': {'num_samples': 1503, 'number_of_characters': 166164, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'eng_Latn-gom_Deva': {'num_samples': 1503, 'number_of_characters': 155414, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'eng_Latn-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 157401, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'eng_Latn-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159096, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 
'unique_sentence2': 1497}, 'eng_Latn-kan_Knda': {'num_samples': 1503, 'number_of_characters': 164309, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'eng_Latn-kas_Arab': {'num_samples': 1503, 'number_of_characters': 163813, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'eng_Latn-mai_Deva': {'num_samples': 1503, 'number_of_characters': 161540, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'eng_Latn-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 171970, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'eng_Latn-mar_Deva': {'num_samples': 1503, 'number_of_characters': 161879, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'eng_Latn-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 
'unique_sentence2': 1498}, 'eng_Latn-npi_Deva': {'num_samples': 1503, 'number_of_characters': 160038, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'eng_Latn-ory_Orya': {'num_samples': 1503, 'number_of_characters': 163355, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'eng_Latn-pan_Guru': {'num_samples': 1503, 'number_of_characters': 159333, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'eng_Latn-san_Deva': {'num_samples': 1503, 'number_of_characters': 157225, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'eng_Latn-sat_Olck': {'num_samples': 1503, 'number_of_characters': 168511, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'eng_Latn-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161755, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 
'unique_sentence2': 1490}, 'eng_Latn-tam_Taml': {'num_samples': 1503, 'number_of_characters': 173998, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'eng_Latn-tel_Telu': {'num_samples': 1503, 'number_of_characters': 156822, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'eng_Latn-urd_Arab': {'num_samples': 1503, 'number_of_characters': 160437, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.18, 'max_sentence1_length': 201, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'gom_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 156282, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'gom_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 150686, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'gom_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 156742, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 
'unique_sentence2': 1498}, 'gom_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'gom_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 155414, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'gom_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 152967, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'gom_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 154662, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'gom_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 159875, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'gom_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 159379, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 
'unique_sentence2': 1502}, 'gom_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 157106, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'gom_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 167536, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'gom_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 157445, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'gom_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 152014, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'gom_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 155604, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'gom_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 158921, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 
'unique_sentence2': 1500}, 'gom_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 154899, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'gom_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 152791, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'gom_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 164077, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'gom_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 157321, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'gom_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 169564, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'gom_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 152388, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 
'unique_sentence2': 1495}, 'gom_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 156003, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 50.23, 'max_sentence1_length': 203, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'guj_Gujr-asm_Beng': {'num_samples': 1503, 'number_of_characters': 158269, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'guj_Gujr-ben_Beng': {'num_samples': 1503, 'number_of_characters': 152673, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'guj_Gujr-brx_Deva': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'guj_Gujr-doi_Deva': {'num_samples': 1503, 'number_of_characters': 163717, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'guj_Gujr-eng_Latn': {'num_samples': 1503, 'number_of_characters': 157401, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 
'unique_sentence2': 1497}, 'guj_Gujr-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152967, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'guj_Gujr-hin_Deva': {'num_samples': 1503, 'number_of_characters': 156649, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'guj_Gujr-kan_Knda': {'num_samples': 1503, 'number_of_characters': 161862, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'guj_Gujr-kas_Arab': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'guj_Gujr-mai_Deva': {'num_samples': 1503, 'number_of_characters': 159093, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'guj_Gujr-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 169523, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 
'unique_sentence2': 1495}, 'guj_Gujr-mar_Deva': {'num_samples': 1503, 'number_of_characters': 159432, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'guj_Gujr-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 154001, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'guj_Gujr-npi_Deva': {'num_samples': 1503, 'number_of_characters': 157591, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'guj_Gujr-ory_Orya': {'num_samples': 1503, 'number_of_characters': 160908, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'guj_Gujr-pan_Guru': {'num_samples': 1503, 'number_of_characters': 156886, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'guj_Gujr-san_Deva': {'num_samples': 1503, 'number_of_characters': 154778, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 
'unique_sentence2': 1500}, 'guj_Gujr-sat_Olck': {'num_samples': 1503, 'number_of_characters': 166064, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'guj_Gujr-snd_Deva': {'num_samples': 1503, 'number_of_characters': 159308, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'guj_Gujr-tam_Taml': {'num_samples': 1503, 'number_of_characters': 171551, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'guj_Gujr-tel_Telu': {'num_samples': 1503, 'number_of_characters': 154375, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'guj_Gujr-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157990, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 51.55, 'max_sentence1_length': 205, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'hin_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 159964, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 
'unique_sentence2': 1497}, 'hin_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 154368, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'hin_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 160424, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'hin_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 165412, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'hin_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 159096, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'hin_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 154662, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'hin_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 156649, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 
'unique_sentence2': 1500}, 'hin_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 163557, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'hin_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 163061, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'hin_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 160788, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'hin_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 171218, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'hin_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 161127, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'hin_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 155696, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 
'unique_sentence2': 1498}, 'hin_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 159286, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'hin_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 162603, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'hin_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 158581, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'hin_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 156473, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'hin_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 167759, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'hin_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 
'unique_sentence2': 1490}, 'hin_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 173246, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'hin_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 156070, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'hin_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 159685, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.68, 'max_sentence1_length': 192, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'kan_Knda-asm_Beng': {'num_samples': 1503, 'number_of_characters': 165177, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'kan_Knda-ben_Beng': {'num_samples': 1503, 'number_of_characters': 159581, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'kan_Knda-brx_Deva': {'num_samples': 1503, 'number_of_characters': 165637, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 
'unique_sentence2': 1498}, 'kan_Knda-doi_Deva': {'num_samples': 1503, 'number_of_characters': 170625, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'kan_Knda-eng_Latn': {'num_samples': 1503, 'number_of_characters': 164309, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'kan_Knda-gom_Deva': {'num_samples': 1503, 'number_of_characters': 159875, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'kan_Knda-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 161862, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'kan_Knda-hin_Deva': {'num_samples': 1503, 'number_of_characters': 163557, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'kan_Knda-kas_Arab': {'num_samples': 1503, 'number_of_characters': 168274, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 
'unique_sentence2': 1502}, 'kan_Knda-mai_Deva': {'num_samples': 1503, 'number_of_characters': 166001, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'kan_Knda-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 176431, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'kan_Knda-mar_Deva': {'num_samples': 1503, 'number_of_characters': 166340, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'kan_Knda-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 160909, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'kan_Knda-npi_Deva': {'num_samples': 1503, 'number_of_characters': 164499, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'kan_Knda-ory_Orya': {'num_samples': 1503, 'number_of_characters': 167816, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 
'unique_sentence2': 1500}, 'kan_Knda-pan_Guru': {'num_samples': 1503, 'number_of_characters': 163794, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'kan_Knda-san_Deva': {'num_samples': 1503, 'number_of_characters': 161686, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'kan_Knda-sat_Olck': {'num_samples': 1503, 'number_of_characters': 172972, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'kan_Knda-snd_Deva': {'num_samples': 1503, 'number_of_characters': 166216, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'kan_Knda-tam_Taml': {'num_samples': 1503, 'number_of_characters': 178459, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'kan_Knda-tel_Telu': {'num_samples': 1503, 'number_of_characters': 161283, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 
'unique_sentence2': 1495}, 'kan_Knda-urd_Arab': {'num_samples': 1503, 'number_of_characters': 164898, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 56.14, 'max_sentence1_length': 201, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'kas_Arab-asm_Beng': {'num_samples': 1503, 'number_of_characters': 164681, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'kas_Arab-ben_Beng': {'num_samples': 1503, 'number_of_characters': 159085, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'kas_Arab-brx_Deva': {'num_samples': 1503, 'number_of_characters': 165141, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'kas_Arab-doi_Deva': {'num_samples': 1503, 'number_of_characters': 170129, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'kas_Arab-eng_Latn': {'num_samples': 1503, 'number_of_characters': 163813, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 
'unique_sentence2': 1497}, 'kas_Arab-gom_Deva': {'num_samples': 1503, 'number_of_characters': 159379, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'kas_Arab-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'kas_Arab-hin_Deva': {'num_samples': 1503, 'number_of_characters': 163061, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'kas_Arab-kan_Knda': {'num_samples': 1503, 'number_of_characters': 168274, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'kas_Arab-mai_Deva': {'num_samples': 1503, 'number_of_characters': 165505, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'kas_Arab-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 175935, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 
'unique_sentence2': 1495}, 'kas_Arab-mar_Deva': {'num_samples': 1503, 'number_of_characters': 165844, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'kas_Arab-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 160413, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'kas_Arab-npi_Deva': {'num_samples': 1503, 'number_of_characters': 164003, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'kas_Arab-ory_Orya': {'num_samples': 1503, 'number_of_characters': 167320, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'kas_Arab-pan_Guru': {'num_samples': 1503, 'number_of_characters': 163298, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'kas_Arab-san_Deva': {'num_samples': 1503, 'number_of_characters': 161190, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 
'unique_sentence2': 1500}, 'kas_Arab-sat_Olck': {'num_samples': 1503, 'number_of_characters': 172476, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'kas_Arab-snd_Deva': {'num_samples': 1503, 'number_of_characters': 165720, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'kas_Arab-tam_Taml': {'num_samples': 1503, 'number_of_characters': 177963, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'kas_Arab-tel_Telu': {'num_samples': 1503, 'number_of_characters': 160787, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'kas_Arab-urd_Arab': {'num_samples': 1503, 'number_of_characters': 164402, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 55.81, 'max_sentence1_length': 203, 'unique_sentence1': 1502, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mai_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162408, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 
'unique_sentence2': 1497}, 'mai_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 156812, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mai_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 162868, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mai_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 167856, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mai_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161540, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mai_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 157106, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mai_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 159093, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 
'unique_sentence2': 1500}, 'mai_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 160788, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mai_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 166001, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mai_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165505, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mai_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 173662, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'mai_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 163571, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'mai_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 158140, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 
'unique_sentence2': 1498}, 'mai_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mai_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 165047, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mai_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 161025, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mai_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 158917, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mai_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 170203, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mai_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 163447, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 
'unique_sentence2': 1490}, 'mai_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 175690, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mai_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158514, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mai_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 162129, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 54.3, 'max_sentence1_length': 230, 'unique_sentence1': 1499, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mal_Mlym-asm_Beng': {'num_samples': 1503, 'number_of_characters': 172838, 'unique_pairs': 1498, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mal_Mlym-ben_Beng': {'num_samples': 1503, 'number_of_characters': 167242, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mal_Mlym-brx_Deva': {'num_samples': 1503, 'number_of_characters': 173298, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 
'unique_sentence2': 1498}, 'mal_Mlym-doi_Deva': {'num_samples': 1503, 'number_of_characters': 178286, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mal_Mlym-eng_Latn': {'num_samples': 1503, 'number_of_characters': 171970, 'unique_pairs': 1499, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mal_Mlym-gom_Deva': {'num_samples': 1503, 'number_of_characters': 167536, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mal_Mlym-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 169523, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mal_Mlym-hin_Deva': {'num_samples': 1503, 'number_of_characters': 171218, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mal_Mlym-kan_Knda': {'num_samples': 1503, 'number_of_characters': 176431, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 
'unique_sentence2': 1499}, 'mal_Mlym-kas_Arab': {'num_samples': 1503, 'number_of_characters': 175935, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mal_Mlym-mai_Deva': {'num_samples': 1503, 'number_of_characters': 173662, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'mal_Mlym-mar_Deva': {'num_samples': 1503, 'number_of_characters': 174001, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'mal_Mlym-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 168570, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'mal_Mlym-npi_Deva': {'num_samples': 1503, 'number_of_characters': 172160, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mal_Mlym-ory_Orya': {'num_samples': 1503, 'number_of_characters': 175477, 'unique_pairs': 1503, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 
'unique_sentence2': 1500}, 'mal_Mlym-pan_Guru': {'num_samples': 1503, 'number_of_characters': 171455, 'unique_pairs': 1498, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mal_Mlym-san_Deva': {'num_samples': 1503, 'number_of_characters': 169347, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mal_Mlym-sat_Olck': {'num_samples': 1503, 'number_of_characters': 180633, 'unique_pairs': 1501, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mal_Mlym-snd_Deva': {'num_samples': 1503, 'number_of_characters': 173877, 'unique_pairs': 1499, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mal_Mlym-tam_Taml': {'num_samples': 1503, 'number_of_characters': 186120, 'unique_pairs': 1502, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mal_Mlym-tel_Telu': {'num_samples': 1503, 'number_of_characters': 168944, 'unique_pairs': 1500, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 
'unique_sentence2': 1495}, 'mal_Mlym-urd_Arab': {'num_samples': 1503, 'number_of_characters': 172559, 'unique_pairs': 1499, 'min_sentence1_length': 5, 'average_sentence1_length': 61.24, 'max_sentence1_length': 219, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mar_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162747, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'mar_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 157151, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mar_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 163207, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mar_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 168195, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mar_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161879, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 
'unique_sentence2': 1497}, 'mar_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 157445, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mar_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 159432, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'mar_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 161127, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mar_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 166340, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mar_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165844, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mar_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 163571, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 
'unique_sentence2': 1499}, 'mar_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 174001, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'mar_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 158479, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'mar_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 162069, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mar_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 165386, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mar_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 161364, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mar_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 159256, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 
'unique_sentence2': 1500}, 'mar_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 170542, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mar_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 163786, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'mar_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 176029, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mar_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158853, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mar_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 162468, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.53, 'max_sentence1_length': 221, 'unique_sentence1': 1501, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'mni_Mtei-asm_Beng': {'num_samples': 1503, 'number_of_characters': 157316, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 
'unique_sentence2': 1497}, 'mni_Mtei-ben_Beng': {'num_samples': 1503, 'number_of_characters': 151720, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'mni_Mtei-brx_Deva': {'num_samples': 1503, 'number_of_characters': 157776, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'mni_Mtei-doi_Deva': {'num_samples': 1503, 'number_of_characters': 162764, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'mni_Mtei-eng_Latn': {'num_samples': 1503, 'number_of_characters': 156448, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'mni_Mtei-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152014, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'mni_Mtei-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 154001, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 
'unique_sentence2': 1500}, 'mni_Mtei-hin_Deva': {'num_samples': 1503, 'number_of_characters': 155696, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'mni_Mtei-kan_Knda': {'num_samples': 1503, 'number_of_characters': 160909, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'mni_Mtei-kas_Arab': {'num_samples': 1503, 'number_of_characters': 160413, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'mni_Mtei-mai_Deva': {'num_samples': 1503, 'number_of_characters': 158140, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'mni_Mtei-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 168570, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'mni_Mtei-mar_Deva': {'num_samples': 1503, 'number_of_characters': 158479, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 
'unique_sentence2': 1501}, 'mni_Mtei-npi_Deva': {'num_samples': 1503, 'number_of_characters': 156638, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'mni_Mtei-ory_Orya': {'num_samples': 1503, 'number_of_characters': 159955, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'mni_Mtei-pan_Guru': {'num_samples': 1503, 'number_of_characters': 155933, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'mni_Mtei-san_Deva': {'num_samples': 1503, 'number_of_characters': 153825, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'mni_Mtei-sat_Olck': {'num_samples': 1503, 'number_of_characters': 165111, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'mni_Mtei-snd_Deva': {'num_samples': 1503, 'number_of_characters': 158355, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 
'unique_sentence2': 1490}, 'mni_Mtei-tam_Taml': {'num_samples': 1503, 'number_of_characters': 170598, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'mni_Mtei-tel_Telu': {'num_samples': 1503, 'number_of_characters': 153422, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'mni_Mtei-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157037, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 50.91, 'max_sentence1_length': 239, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'npi_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 160906, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'npi_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155310, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'npi_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 161366, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 
'unique_sentence2': 1498}, 'npi_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 166354, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'npi_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 160038, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'npi_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 155604, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'npi_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 157591, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'npi_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159286, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'npi_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 164499, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 
'unique_sentence2': 1499}, 'npi_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 164003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'npi_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 161730, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'npi_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 172160, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'npi_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 162069, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'npi_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 156638, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'npi_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 163545, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 
'unique_sentence2': 1500}, 'npi_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 159523, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'npi_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 157415, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'npi_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 168701, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'npi_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161945, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'npi_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 174188, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'npi_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 157012, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 
'unique_sentence2': 1495}, 'npi_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 160627, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.3, 'max_sentence1_length': 223, 'unique_sentence1': 1497, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'ory_Orya-asm_Beng': {'num_samples': 1503, 'number_of_characters': 164223, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'ory_Orya-ben_Beng': {'num_samples': 1503, 'number_of_characters': 158627, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'ory_Orya-brx_Deva': {'num_samples': 1503, 'number_of_characters': 164683, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'ory_Orya-doi_Deva': {'num_samples': 1503, 'number_of_characters': 169671, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'ory_Orya-eng_Latn': {'num_samples': 1503, 'number_of_characters': 163355, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 
'unique_sentence2': 1497}, 'ory_Orya-gom_Deva': {'num_samples': 1503, 'number_of_characters': 158921, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'ory_Orya-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 160908, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'ory_Orya-hin_Deva': {'num_samples': 1503, 'number_of_characters': 162603, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'ory_Orya-kan_Knda': {'num_samples': 1503, 'number_of_characters': 167816, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'ory_Orya-kas_Arab': {'num_samples': 1503, 'number_of_characters': 167320, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'ory_Orya-mai_Deva': {'num_samples': 1503, 'number_of_characters': 165047, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 
'unique_sentence2': 1499}, 'ory_Orya-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 175477, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'ory_Orya-mar_Deva': {'num_samples': 1503, 'number_of_characters': 165386, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'ory_Orya-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 159955, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'ory_Orya-npi_Deva': {'num_samples': 1503, 'number_of_characters': 163545, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'ory_Orya-pan_Guru': {'num_samples': 1503, 'number_of_characters': 162840, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'ory_Orya-san_Deva': {'num_samples': 1503, 'number_of_characters': 160732, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 
'unique_sentence2': 1500}, 'ory_Orya-sat_Olck': {'num_samples': 1503, 'number_of_characters': 172018, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'ory_Orya-snd_Deva': {'num_samples': 1503, 'number_of_characters': 165262, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'ory_Orya-tam_Taml': {'num_samples': 1503, 'number_of_characters': 177505, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'ory_Orya-tel_Telu': {'num_samples': 1503, 'number_of_characters': 160329, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'ory_Orya-urd_Arab': {'num_samples': 1503, 'number_of_characters': 163944, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 55.51, 'max_sentence1_length': 195, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'pan_Guru-asm_Beng': {'num_samples': 1503, 'number_of_characters': 160201, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 
'unique_sentence2': 1497}, 'pan_Guru-ben_Beng': {'num_samples': 1503, 'number_of_characters': 154605, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'pan_Guru-brx_Deva': {'num_samples': 1503, 'number_of_characters': 160661, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'pan_Guru-doi_Deva': {'num_samples': 1503, 'number_of_characters': 165649, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'pan_Guru-eng_Latn': {'num_samples': 1503, 'number_of_characters': 159333, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'pan_Guru-gom_Deva': {'num_samples': 1503, 'number_of_characters': 154899, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'pan_Guru-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 156886, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 
'unique_sentence2': 1500}, 'pan_Guru-hin_Deva': {'num_samples': 1503, 'number_of_characters': 158581, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'pan_Guru-kan_Knda': {'num_samples': 1503, 'number_of_characters': 163794, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'pan_Guru-kas_Arab': {'num_samples': 1503, 'number_of_characters': 163298, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'pan_Guru-mai_Deva': {'num_samples': 1503, 'number_of_characters': 161025, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'pan_Guru-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 171455, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'pan_Guru-mar_Deva': {'num_samples': 1503, 'number_of_characters': 161364, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 
'unique_sentence2': 1501}, 'pan_Guru-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 155933, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'pan_Guru-npi_Deva': {'num_samples': 1503, 'number_of_characters': 159523, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'pan_Guru-ory_Orya': {'num_samples': 1503, 'number_of_characters': 162840, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'pan_Guru-san_Deva': {'num_samples': 1503, 'number_of_characters': 156710, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'pan_Guru-sat_Olck': {'num_samples': 1503, 'number_of_characters': 167996, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'pan_Guru-snd_Deva': {'num_samples': 1503, 'number_of_characters': 161240, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 
'unique_sentence2': 1490}, 'pan_Guru-tam_Taml': {'num_samples': 1503, 'number_of_characters': 173483, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'pan_Guru-tel_Telu': {'num_samples': 1503, 'number_of_characters': 156307, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'pan_Guru-urd_Arab': {'num_samples': 1503, 'number_of_characters': 159922, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 52.83, 'max_sentence1_length': 221, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'san_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 158093, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'san_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 152497, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'san_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 158553, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 
'unique_sentence2': 1498}, 'san_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 163541, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'san_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 157225, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'san_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152791, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'san_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 154778, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'san_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 156473, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'san_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 161686, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 
'unique_sentence2': 1499}, 'san_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 161190, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'san_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 158917, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'san_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 169347, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'san_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 159256, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'san_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 153825, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'san_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 157415, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 
'unique_sentence2': 1497}, 'san_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 160732, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'san_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 156710, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'san_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 165888, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'san_Deva-snd_Deva': {'num_samples': 1503, 'number_of_characters': 159132, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'san_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 171375, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'san_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 154199, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 
'unique_sentence2': 1495}, 'san_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157814, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 51.43, 'max_sentence1_length': 181, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'sat_Olck-asm_Beng': {'num_samples': 1503, 'number_of_characters': 169379, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'sat_Olck-ben_Beng': {'num_samples': 1503, 'number_of_characters': 163783, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'sat_Olck-brx_Deva': {'num_samples': 1503, 'number_of_characters': 169839, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'sat_Olck-doi_Deva': {'num_samples': 1503, 'number_of_characters': 174827, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'sat_Olck-eng_Latn': {'num_samples': 1503, 'number_of_characters': 168511, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 
'unique_sentence2': 1497}, 'sat_Olck-gom_Deva': {'num_samples': 1503, 'number_of_characters': 164077, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'sat_Olck-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 166064, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'sat_Olck-hin_Deva': {'num_samples': 1503, 'number_of_characters': 167759, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'sat_Olck-kan_Knda': {'num_samples': 1503, 'number_of_characters': 172972, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'sat_Olck-kas_Arab': {'num_samples': 1503, 'number_of_characters': 172476, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'sat_Olck-mai_Deva': {'num_samples': 1503, 'number_of_characters': 170203, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 
'unique_sentence2': 1499}, 'sat_Olck-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 180633, 'unique_pairs': 1501, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'sat_Olck-mar_Deva': {'num_samples': 1503, 'number_of_characters': 170542, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'sat_Olck-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 165111, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'sat_Olck-npi_Deva': {'num_samples': 1503, 'number_of_characters': 168701, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'sat_Olck-ory_Orya': {'num_samples': 1503, 'number_of_characters': 172018, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'sat_Olck-pan_Guru': {'num_samples': 1503, 'number_of_characters': 167996, 'unique_pairs': 1501, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 
'unique_sentence2': 1495}, 'sat_Olck-san_Deva': {'num_samples': 1503, 'number_of_characters': 165888, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'sat_Olck-snd_Deva': {'num_samples': 1503, 'number_of_characters': 170418, 'unique_pairs': 1501, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'sat_Olck-tam_Taml': {'num_samples': 1503, 'number_of_characters': 182661, 'unique_pairs': 1503, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'sat_Olck-tel_Telu': {'num_samples': 1503, 'number_of_characters': 165485, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'sat_Olck-urd_Arab': {'num_samples': 1503, 'number_of_characters': 169100, 'unique_pairs': 1502, 'min_sentence1_length': 7, 'average_sentence1_length': 58.94, 'max_sentence1_length': 225, 'unique_sentence1': 1500, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'snd_Deva-asm_Beng': {'num_samples': 1503, 'number_of_characters': 162623, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 
'unique_sentence2': 1497}, 'snd_Deva-ben_Beng': {'num_samples': 1503, 'number_of_characters': 157027, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'snd_Deva-brx_Deva': {'num_samples': 1503, 'number_of_characters': 163083, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'snd_Deva-doi_Deva': {'num_samples': 1503, 'number_of_characters': 168071, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'snd_Deva-eng_Latn': {'num_samples': 1503, 'number_of_characters': 161755, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'snd_Deva-gom_Deva': {'num_samples': 1503, 'number_of_characters': 157321, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'snd_Deva-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 159308, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 
'unique_sentence2': 1500}, 'snd_Deva-hin_Deva': {'num_samples': 1503, 'number_of_characters': 161003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'snd_Deva-kan_Knda': {'num_samples': 1503, 'number_of_characters': 166216, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'snd_Deva-kas_Arab': {'num_samples': 1503, 'number_of_characters': 165720, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'snd_Deva-mai_Deva': {'num_samples': 1503, 'number_of_characters': 163447, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'snd_Deva-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 173877, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'snd_Deva-mar_Deva': {'num_samples': 1503, 'number_of_characters': 163786, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 
'unique_sentence2': 1501}, 'snd_Deva-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 158355, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'snd_Deva-npi_Deva': {'num_samples': 1503, 'number_of_characters': 161945, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'snd_Deva-ory_Orya': {'num_samples': 1503, 'number_of_characters': 165262, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'snd_Deva-pan_Guru': {'num_samples': 1503, 'number_of_characters': 161240, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'snd_Deva-san_Deva': {'num_samples': 1503, 'number_of_characters': 159132, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'snd_Deva-sat_Olck': {'num_samples': 1503, 'number_of_characters': 170418, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 
'unique_sentence2': 1500}, 'snd_Deva-tam_Taml': {'num_samples': 1503, 'number_of_characters': 175905, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'snd_Deva-tel_Telu': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}, 'snd_Deva-urd_Arab': {'num_samples': 1503, 'number_of_characters': 162344, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 54.45, 'max_sentence1_length': 195, 'unique_sentence1': 1490, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'tam_Taml-asm_Beng': {'num_samples': 1503, 'number_of_characters': 174866, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'tam_Taml-ben_Beng': {'num_samples': 1503, 'number_of_characters': 169270, 'unique_pairs': 1501, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'tam_Taml-brx_Deva': {'num_samples': 1503, 'number_of_characters': 175326, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 
'unique_sentence2': 1498}, 'tam_Taml-doi_Deva': {'num_samples': 1503, 'number_of_characters': 180314, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'tam_Taml-eng_Latn': {'num_samples': 1503, 'number_of_characters': 173998, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'tam_Taml-gom_Deva': {'num_samples': 1503, 'number_of_characters': 169564, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'tam_Taml-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 171551, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'tam_Taml-hin_Deva': {'num_samples': 1503, 'number_of_characters': 173246, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'tam_Taml-kan_Knda': {'num_samples': 1503, 'number_of_characters': 178459, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 
'unique_sentence2': 1499}, 'tam_Taml-kas_Arab': {'num_samples': 1503, 'number_of_characters': 177963, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'tam_Taml-mai_Deva': {'num_samples': 1503, 'number_of_characters': 175690, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'tam_Taml-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 186120, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'tam_Taml-mar_Deva': {'num_samples': 1503, 'number_of_characters': 176029, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'tam_Taml-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 170598, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'tam_Taml-npi_Deva': {'num_samples': 1503, 'number_of_characters': 174188, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 
'unique_sentence2': 1497}, 'tam_Taml-ory_Orya': {'num_samples': 1503, 'number_of_characters': 177505, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'tam_Taml-pan_Guru': {'num_samples': 1503, 'number_of_characters': 173483, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'tam_Taml-san_Deva': {'num_samples': 1503, 'number_of_characters': 171375, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'tam_Taml-sat_Olck': {'num_samples': 1503, 'number_of_characters': 182661, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'tam_Taml-snd_Deva': {'num_samples': 1503, 'number_of_characters': 175905, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'tam_Taml-tel_Telu': {'num_samples': 1503, 'number_of_characters': 170972, 'unique_pairs': 1502, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 
'unique_sentence2': 1495}, 'tam_Taml-urd_Arab': {'num_samples': 1503, 'number_of_characters': 174587, 'unique_pairs': 1503, 'min_sentence1_length': 3, 'average_sentence1_length': 62.59, 'max_sentence1_length': 224, 'unique_sentence1': 1492, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'tel_Telu-asm_Beng': {'num_samples': 1503, 'number_of_characters': 157690, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 'unique_sentence2': 1497}, 'tel_Telu-ben_Beng': {'num_samples': 1503, 'number_of_characters': 152094, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'tel_Telu-brx_Deva': {'num_samples': 1503, 'number_of_characters': 158150, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'tel_Telu-doi_Deva': {'num_samples': 1503, 'number_of_characters': 163138, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'tel_Telu-eng_Latn': {'num_samples': 1503, 'number_of_characters': 156822, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 
'unique_sentence2': 1497}, 'tel_Telu-gom_Deva': {'num_samples': 1503, 'number_of_characters': 152388, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'tel_Telu-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 154375, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 'unique_sentence2': 1500}, 'tel_Telu-hin_Deva': {'num_samples': 1503, 'number_of_characters': 156070, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'tel_Telu-kan_Knda': {'num_samples': 1503, 'number_of_characters': 161283, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'tel_Telu-kas_Arab': {'num_samples': 1503, 'number_of_characters': 160787, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'tel_Telu-mai_Deva': {'num_samples': 1503, 'number_of_characters': 158514, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 
'unique_sentence2': 1499}, 'tel_Telu-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 168944, 'unique_pairs': 1500, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'tel_Telu-mar_Deva': {'num_samples': 1503, 'number_of_characters': 158853, 'unique_pairs': 1503, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 'unique_sentence2': 1501}, 'tel_Telu-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 153422, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'tel_Telu-npi_Deva': {'num_samples': 1503, 'number_of_characters': 157012, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'tel_Telu-ory_Orya': {'num_samples': 1503, 'number_of_characters': 160329, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'tel_Telu-pan_Guru': {'num_samples': 1503, 'number_of_characters': 156307, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 
'unique_sentence2': 1495}, 'tel_Telu-san_Deva': {'num_samples': 1503, 'number_of_characters': 154199, 'unique_pairs': 1501, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'tel_Telu-sat_Olck': {'num_samples': 1503, 'number_of_characters': 165485, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 'unique_sentence2': 1500}, 'tel_Telu-snd_Deva': {'num_samples': 1503, 'number_of_characters': 158729, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'tel_Telu-tam_Taml': {'num_samples': 1503, 'number_of_characters': 170972, 'unique_pairs': 1502, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'tel_Telu-urd_Arab': {'num_samples': 1503, 'number_of_characters': 157411, 'unique_pairs': 1499, 'min_sentence1_length': 6, 'average_sentence1_length': 51.16, 'max_sentence1_length': 182, 'unique_sentence1': 1495, 'min_sentence2_length': 4, 'average_sentence2_length': 53.57, 'max_sentence2_length': 206, 'unique_sentence2': 1498}, 'urd_Arab-asm_Beng': {'num_samples': 1503, 'number_of_characters': 161305, 'unique_pairs': 1498, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.75, 'max_sentence2_length': 208, 
'unique_sentence2': 1497}, 'urd_Arab-ben_Beng': {'num_samples': 1503, 'number_of_characters': 155709, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.03, 'max_sentence2_length': 178, 'unique_sentence2': 1497}, 'urd_Arab-brx_Deva': {'num_samples': 1503, 'number_of_characters': 161765, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.06, 'max_sentence2_length': 210, 'unique_sentence2': 1498}, 'urd_Arab-doi_Deva': {'num_samples': 1503, 'number_of_characters': 166753, 'unique_pairs': 1500, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 57.38, 'max_sentence2_length': 209, 'unique_sentence2': 1499}, 'urd_Arab-eng_Latn': {'num_samples': 1503, 'number_of_characters': 160437, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.18, 'max_sentence2_length': 201, 'unique_sentence2': 1497}, 'urd_Arab-gom_Deva': {'num_samples': 1503, 'number_of_characters': 156003, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 50.23, 'max_sentence2_length': 203, 'unique_sentence2': 1500}, 'urd_Arab-guj_Gujr': {'num_samples': 1503, 'number_of_characters': 157990, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 51.55, 'max_sentence2_length': 205, 
'unique_sentence2': 1500}, 'urd_Arab-hin_Deva': {'num_samples': 1503, 'number_of_characters': 159685, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.68, 'max_sentence2_length': 192, 'unique_sentence2': 1497}, 'urd_Arab-kan_Knda': {'num_samples': 1503, 'number_of_characters': 164898, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 56.14, 'max_sentence2_length': 201, 'unique_sentence2': 1499}, 'urd_Arab-kas_Arab': {'num_samples': 1503, 'number_of_characters': 164402, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 55.81, 'max_sentence2_length': 203, 'unique_sentence2': 1502}, 'urd_Arab-mai_Deva': {'num_samples': 1503, 'number_of_characters': 162129, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 54.3, 'max_sentence2_length': 230, 'unique_sentence2': 1499}, 'urd_Arab-mal_Mlym': {'num_samples': 1503, 'number_of_characters': 172559, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 5, 'average_sentence2_length': 61.24, 'max_sentence2_length': 219, 'unique_sentence2': 1495}, 'urd_Arab-mar_Deva': {'num_samples': 1503, 'number_of_characters': 162468, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.53, 'max_sentence2_length': 221, 
'unique_sentence2': 1501}, 'urd_Arab-mni_Mtei': {'num_samples': 1503, 'number_of_characters': 157037, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 50.91, 'max_sentence2_length': 239, 'unique_sentence2': 1498}, 'urd_Arab-npi_Deva': {'num_samples': 1503, 'number_of_characters': 160627, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 53.3, 'max_sentence2_length': 223, 'unique_sentence2': 1497}, 'urd_Arab-ory_Orya': {'num_samples': 1503, 'number_of_characters': 163944, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 55.51, 'max_sentence2_length': 195, 'unique_sentence2': 1500}, 'urd_Arab-pan_Guru': {'num_samples': 1503, 'number_of_characters': 159922, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 52.83, 'max_sentence2_length': 221, 'unique_sentence2': 1495}, 'urd_Arab-san_Deva': {'num_samples': 1503, 'number_of_characters': 157814, 'unique_pairs': 1501, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 51.43, 'max_sentence2_length': 181, 'unique_sentence2': 1500}, 'urd_Arab-sat_Olck': {'num_samples': 1503, 'number_of_characters': 169100, 'unique_pairs': 1502, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 7, 'average_sentence2_length': 58.94, 'max_sentence2_length': 225, 
'unique_sentence2': 1500}, 'urd_Arab-snd_Deva': {'num_samples': 1503, 'number_of_characters': 162344, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 4, 'average_sentence2_length': 54.45, 'max_sentence2_length': 195, 'unique_sentence2': 1490}, 'urd_Arab-tam_Taml': {'num_samples': 1503, 'number_of_characters': 174587, 'unique_pairs': 1503, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 3, 'average_sentence2_length': 62.59, 'max_sentence2_length': 224, 'unique_sentence2': 1492}, 'urd_Arab-tel_Telu': {'num_samples': 1503, 'number_of_characters': 157411, 'unique_pairs': 1499, 'min_sentence1_length': 4, 'average_sentence1_length': 53.57, 'max_sentence1_length': 206, 'unique_sentence1': 1498, 'min_sentence2_length': 6, 'average_sentence2_length': 51.16, 'max_sentence2_length': 182, 'unique_sentence2': 1495}}}} | +| [IN22GenBitextMining](https://huggingface.co/datasets/ai4bharat/IN22-Gen) (Jay Gala, 2023) | ['asm', 'ben', 'brx', 'doi', 'eng', 'gom', 'guj', 'hin', 'kan', 'kas', 'mai', 'mal', 'mar', 'mni', 'npi', 'ory', 'pan', 'san', 'sat', 'snd', 'tam', 'tel', 'urd'] | BitextMining | s2s | [Government, Legal, News, Non-fiction, Religious, Web, Written] | {'test': 518144} | {'test': {'num_samples': 518144, 'number_of_characters': 162367876, 'unique_pairs': 518101, 'min_sentence1_length': 9, 'average_sentence1_length': 156.68, 'max_sentence1_length': 692, 'unique_sentence1': 23550, 'min_sentence2_length': 9, 'average_sentence2_length': 156.68, 'max_sentence2_length': 692, 'unique_sentence2': 23550, 'hf_subset_descriptive_stats': {'asm_Beng-ben_Beng': {'num_samples': 1024, 'number_of_characters': 310622, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 
'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'asm_Beng-brx_Deva': {'num_samples': 1024, 'number_of_characters': 323609, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'asm_Beng-doi_Deva': {'num_samples': 1024, 'number_of_characters': 319020, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'asm_Beng-eng_Latn': {'num_samples': 1024, 'number_of_characters': 320098, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'asm_Beng-gom_Deva': {'num_samples': 1024, 'number_of_characters': 312594, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'asm_Beng-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 309440, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'asm_Beng-hin_Deva': {'num_samples': 1024, 'number_of_characters': 320106, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 
'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'asm_Beng-kan_Knda': {'num_samples': 1024, 'number_of_characters': 332064, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'asm_Beng-kas_Arab': {'num_samples': 1024, 'number_of_characters': 322764, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'asm_Beng-mai_Deva': {'num_samples': 1024, 'number_of_characters': 308682, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'asm_Beng-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 343636, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'asm_Beng-mar_Deva': {'num_samples': 1024, 'number_of_characters': 321784, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'asm_Beng-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 313134, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 
'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'asm_Beng-npi_Deva': {'num_samples': 1024, 'number_of_characters': 313419, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'asm_Beng-ory_Orya': {'num_samples': 1024, 'number_of_characters': 334226, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'asm_Beng-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306863, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'asm_Beng-san_Deva': {'num_samples': 1024, 'number_of_characters': 318079, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'asm_Beng-sat_Olck': {'num_samples': 1024, 'number_of_characters': 326732, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'asm_Beng-snd_Deva': {'num_samples': 1024, 'number_of_characters': 320421, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 
'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'asm_Beng-tam_Taml': {'num_samples': 1024, 'number_of_characters': 348346, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'asm_Beng-tel_Telu': {'num_samples': 1024, 'number_of_characters': 319045, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'asm_Beng-urd_Arab': {'num_samples': 1024, 'number_of_characters': 315134, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 156.7, 'max_sentence1_length': 582, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'ben_Beng-asm_Beng': {'num_samples': 1024, 'number_of_characters': 310622, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'ben_Beng-brx_Deva': {'num_samples': 1024, 'number_of_characters': 313313, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'ben_Beng-doi_Deva': {'num_samples': 1024, 'number_of_characters': 308724, 'unique_pairs': 1024, 'min_sentence1_length': 9, 
'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'ben_Beng-eng_Latn': {'num_samples': 1024, 'number_of_characters': 309802, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'ben_Beng-gom_Deva': {'num_samples': 1024, 'number_of_characters': 302298, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'ben_Beng-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 299144, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'ben_Beng-hin_Deva': {'num_samples': 1024, 'number_of_characters': 309810, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'ben_Beng-kan_Knda': {'num_samples': 1024, 'number_of_characters': 321768, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'ben_Beng-kas_Arab': {'num_samples': 1024, 'number_of_characters': 312468, 'unique_pairs': 1024, 
'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'ben_Beng-mai_Deva': {'num_samples': 1024, 'number_of_characters': 298386, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'ben_Beng-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 333340, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'ben_Beng-mar_Deva': {'num_samples': 1024, 'number_of_characters': 311488, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'ben_Beng-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 302838, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'ben_Beng-npi_Deva': {'num_samples': 1024, 'number_of_characters': 303123, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'ben_Beng-ory_Orya': {'num_samples': 1024, 'number_of_characters': 323930, 
'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'ben_Beng-pan_Guru': {'num_samples': 1024, 'number_of_characters': 296567, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'ben_Beng-san_Deva': {'num_samples': 1024, 'number_of_characters': 307783, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'ben_Beng-sat_Olck': {'num_samples': 1024, 'number_of_characters': 316436, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'ben_Beng-snd_Deva': {'num_samples': 1024, 'number_of_characters': 310125, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'ben_Beng-tam_Taml': {'num_samples': 1024, 'number_of_characters': 338050, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'ben_Beng-tel_Telu': {'num_samples': 1024, 
'number_of_characters': 308749, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'ben_Beng-urd_Arab': {'num_samples': 1024, 'number_of_characters': 304838, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 146.64, 'max_sentence1_length': 538, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'brx_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 323609, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'brx_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 313313, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'brx_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 321711, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'brx_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 322789, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'brx_Deva-gom_Deva': 
{'num_samples': 1024, 'number_of_characters': 315285, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'brx_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 312131, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'brx_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 322797, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'brx_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 334755, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'brx_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 325455, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'brx_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 311373, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 
'brx_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 346327, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'brx_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 324475, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'brx_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 315825, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'brx_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 316110, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'brx_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 336917, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'brx_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 309554, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 
'unique_sentence2': 1024}, 'brx_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 320770, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'brx_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 329423, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'brx_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 323112, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'brx_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 351037, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'brx_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 321736, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'brx_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 317825, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 159.33, 'max_sentence1_length': 631, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 
'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'doi_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 319020, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'doi_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 308724, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'doi_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 321711, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'doi_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 318200, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'doi_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 310696, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'doi_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 307542, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 
'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'doi_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 318208, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'doi_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 330166, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'doi_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 320866, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'doi_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 306784, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'doi_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 341738, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'doi_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 319886, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 
'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'doi_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 311236, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'doi_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 311521, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'doi_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 332328, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'doi_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 304965, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'doi_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 316181, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'doi_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 324834, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 
'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'doi_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 318523, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'doi_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 346448, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'doi_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 317147, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'doi_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 313236, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.84, 'max_sentence1_length': 500, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'eng_Latn-asm_Beng': {'num_samples': 1024, 'number_of_characters': 320098, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'eng_Latn-ben_Beng': {'num_samples': 1024, 'number_of_characters': 309802, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 
'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'eng_Latn-brx_Deva': {'num_samples': 1024, 'number_of_characters': 322789, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'eng_Latn-doi_Deva': {'num_samples': 1024, 'number_of_characters': 318200, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'eng_Latn-gom_Deva': {'num_samples': 1024, 'number_of_characters': 311774, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'eng_Latn-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 308620, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'eng_Latn-hin_Deva': {'num_samples': 1024, 'number_of_characters': 319286, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'eng_Latn-kan_Knda': {'num_samples': 1024, 'number_of_characters': 331244, 'unique_pairs': 1024, 'min_sentence1_length': 17, 
'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'eng_Latn-kas_Arab': {'num_samples': 1024, 'number_of_characters': 321944, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'eng_Latn-mai_Deva': {'num_samples': 1024, 'number_of_characters': 307862, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'eng_Latn-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 342816, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'eng_Latn-mar_Deva': {'num_samples': 1024, 'number_of_characters': 320964, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'eng_Latn-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 312314, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'eng_Latn-npi_Deva': {'num_samples': 1024, 'number_of_characters': 312599, 'unique_pairs': 1024, 
'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'eng_Latn-ory_Orya': {'num_samples': 1024, 'number_of_characters': 333406, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'eng_Latn-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306043, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'eng_Latn-san_Deva': {'num_samples': 1024, 'number_of_characters': 317259, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'eng_Latn-sat_Olck': {'num_samples': 1024, 'number_of_characters': 325912, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'eng_Latn-snd_Deva': {'num_samples': 1024, 'number_of_characters': 319601, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'eng_Latn-tam_Taml': {'num_samples': 1024, 'number_of_characters': 347526, 
'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'eng_Latn-tel_Telu': {'num_samples': 1024, 'number_of_characters': 318225, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'eng_Latn-urd_Arab': {'num_samples': 1024, 'number_of_characters': 314314, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 155.9, 'max_sentence1_length': 532, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'gom_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 312594, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'gom_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 302298, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'gom_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 315285, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'gom_Deva-doi_Deva': {'num_samples': 1024, 
'number_of_characters': 310696, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'gom_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 311774, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'gom_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 301116, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'gom_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 311782, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'gom_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 323740, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'gom_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 314440, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'gom_Deva-mai_Deva': 
{'num_samples': 1024, 'number_of_characters': 300358, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'gom_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 335312, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'gom_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 313460, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'gom_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 304810, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'gom_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 305095, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'gom_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 325902, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 
'gom_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 298539, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'gom_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 309755, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'gom_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 318408, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'gom_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 312097, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'gom_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 340022, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'gom_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 310721, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 
'unique_sentence2': 1024}, 'gom_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 306810, 'unique_pairs': 1024, 'min_sentence1_length': 17, 'average_sentence1_length': 148.57, 'max_sentence1_length': 537, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'guj_Gujr-asm_Beng': {'num_samples': 1024, 'number_of_characters': 309440, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'guj_Gujr-ben_Beng': {'num_samples': 1024, 'number_of_characters': 299144, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'guj_Gujr-brx_Deva': {'num_samples': 1024, 'number_of_characters': 312131, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'guj_Gujr-doi_Deva': {'num_samples': 1024, 'number_of_characters': 307542, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'guj_Gujr-eng_Latn': {'num_samples': 1024, 'number_of_characters': 308620, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 
'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'guj_Gujr-gom_Deva': {'num_samples': 1024, 'number_of_characters': 301116, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'guj_Gujr-hin_Deva': {'num_samples': 1024, 'number_of_characters': 308628, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'guj_Gujr-kan_Knda': {'num_samples': 1024, 'number_of_characters': 320586, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'guj_Gujr-kas_Arab': {'num_samples': 1024, 'number_of_characters': 311286, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'guj_Gujr-mai_Deva': {'num_samples': 1024, 'number_of_characters': 297204, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'guj_Gujr-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 332158, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 
'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'guj_Gujr-mar_Deva': {'num_samples': 1024, 'number_of_characters': 310306, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'guj_Gujr-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 301656, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'guj_Gujr-npi_Deva': {'num_samples': 1024, 'number_of_characters': 301941, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'guj_Gujr-ory_Orya': {'num_samples': 1024, 'number_of_characters': 322748, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'guj_Gujr-pan_Guru': {'num_samples': 1024, 'number_of_characters': 295385, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'guj_Gujr-san_Deva': {'num_samples': 1024, 'number_of_characters': 306601, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 
'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'guj_Gujr-sat_Olck': {'num_samples': 1024, 'number_of_characters': 315254, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'guj_Gujr-snd_Deva': {'num_samples': 1024, 'number_of_characters': 308943, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'guj_Gujr-tam_Taml': {'num_samples': 1024, 'number_of_characters': 336868, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'guj_Gujr-tel_Telu': {'num_samples': 1024, 'number_of_characters': 307567, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'guj_Gujr-urd_Arab': {'num_samples': 1024, 'number_of_characters': 303656, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 145.49, 'max_sentence1_length': 488, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'hin_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 320106, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 
'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'hin_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 309810, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'hin_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 322797, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'hin_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 318208, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'hin_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 319286, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'hin_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 311782, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'hin_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 308628, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 
'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'hin_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 331252, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'hin_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 321952, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'hin_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 307870, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'hin_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 342824, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'hin_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 320972, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'hin_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 312322, 'unique_pairs': 1024, 'min_sentence1_length': 21, 
'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'hin_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 312607, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'hin_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 333414, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'hin_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306051, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'hin_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 317267, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'hin_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 325920, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'hin_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 319609, 'unique_pairs': 1024, 
'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'hin_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 347534, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'hin_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 318233, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'hin_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 314322, 'unique_pairs': 1024, 'min_sentence1_length': 21, 'average_sentence1_length': 155.91, 'max_sentence1_length': 531, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'kan_Knda-asm_Beng': {'num_samples': 1024, 'number_of_characters': 332064, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'kan_Knda-ben_Beng': {'num_samples': 1024, 'number_of_characters': 321768, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'kan_Knda-brx_Deva': {'num_samples': 1024, 'number_of_characters': 334755, 
'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'kan_Knda-doi_Deva': {'num_samples': 1024, 'number_of_characters': 330166, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'kan_Knda-eng_Latn': {'num_samples': 1024, 'number_of_characters': 331244, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'kan_Knda-gom_Deva': {'num_samples': 1024, 'number_of_characters': 323740, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'kan_Knda-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 320586, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'kan_Knda-hin_Deva': {'num_samples': 1024, 'number_of_characters': 331252, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'kan_Knda-kas_Arab': {'num_samples': 1024, 
'number_of_characters': 333910, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'kan_Knda-mai_Deva': {'num_samples': 1024, 'number_of_characters': 319828, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'kan_Knda-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 354782, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'kan_Knda-mar_Deva': {'num_samples': 1024, 'number_of_characters': 332930, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'kan_Knda-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 324280, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'kan_Knda-npi_Deva': {'num_samples': 1024, 'number_of_characters': 324565, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'kan_Knda-ory_Orya': 
{'num_samples': 1024, 'number_of_characters': 345372, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'kan_Knda-pan_Guru': {'num_samples': 1024, 'number_of_characters': 318009, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'kan_Knda-san_Deva': {'num_samples': 1024, 'number_of_characters': 329225, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'kan_Knda-sat_Olck': {'num_samples': 1024, 'number_of_characters': 337878, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'kan_Knda-snd_Deva': {'num_samples': 1024, 'number_of_characters': 331567, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'kan_Knda-tam_Taml': {'num_samples': 1024, 'number_of_characters': 359492, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 
'kan_Knda-tel_Telu': {'num_samples': 1024, 'number_of_characters': 330191, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'kan_Knda-urd_Arab': {'num_samples': 1024, 'number_of_characters': 326280, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 167.58, 'max_sentence1_length': 668, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'kas_Arab-asm_Beng': {'num_samples': 1024, 'number_of_characters': 322764, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'kas_Arab-ben_Beng': {'num_samples': 1024, 'number_of_characters': 312468, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'kas_Arab-brx_Deva': {'num_samples': 1024, 'number_of_characters': 325455, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'kas_Arab-doi_Deva': {'num_samples': 1024, 'number_of_characters': 320866, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 
'unique_sentence2': 1024}, 'kas_Arab-eng_Latn': {'num_samples': 1024, 'number_of_characters': 321944, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'kas_Arab-gom_Deva': {'num_samples': 1024, 'number_of_characters': 314440, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'kas_Arab-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 311286, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'kas_Arab-hin_Deva': {'num_samples': 1024, 'number_of_characters': 321952, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'kas_Arab-kan_Knda': {'num_samples': 1024, 'number_of_characters': 333910, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'kas_Arab-mai_Deva': {'num_samples': 1024, 'number_of_characters': 310528, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 
'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'kas_Arab-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 345482, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'kas_Arab-mar_Deva': {'num_samples': 1024, 'number_of_characters': 323630, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'kas_Arab-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 314980, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'kas_Arab-npi_Deva': {'num_samples': 1024, 'number_of_characters': 315265, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'kas_Arab-ory_Orya': {'num_samples': 1024, 'number_of_characters': 336072, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'kas_Arab-pan_Guru': {'num_samples': 1024, 'number_of_characters': 308709, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 
'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'kas_Arab-san_Deva': {'num_samples': 1024, 'number_of_characters': 319925, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'kas_Arab-sat_Olck': {'num_samples': 1024, 'number_of_characters': 328578, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'kas_Arab-snd_Deva': {'num_samples': 1024, 'number_of_characters': 322267, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'kas_Arab-tam_Taml': {'num_samples': 1024, 'number_of_characters': 350192, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'kas_Arab-tel_Telu': {'num_samples': 1024, 'number_of_characters': 320891, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'kas_Arab-urd_Arab': {'num_samples': 1024, 'number_of_characters': 316980, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 158.5, 'max_sentence1_length': 520, 'unique_sentence1': 1024, 
'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mai_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 308682, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mai_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 298386, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mai_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 311373, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mai_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 306784, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mai_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 307862, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mai_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 300358, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 
'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mai_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 297204, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mai_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 307870, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mai_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 319828, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mai_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 310528, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mai_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 331400, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'mai_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 309548, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 
'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'mai_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 300898, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'mai_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 301183, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mai_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 321990, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mai_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 294627, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mai_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 305843, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mai_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 314496, 'unique_pairs': 1024, 'min_sentence1_length': 14, 
'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mai_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 308185, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mai_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 336110, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mai_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 306809, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mai_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 302898, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 144.75, 'max_sentence1_length': 562, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mal_Mlym-asm_Beng': {'num_samples': 1024, 'number_of_characters': 343636, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mal_Mlym-ben_Beng': {'num_samples': 1024, 'number_of_characters': 333340, 'unique_pairs': 1024, 
'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mal_Mlym-brx_Deva': {'num_samples': 1024, 'number_of_characters': 346327, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mal_Mlym-doi_Deva': {'num_samples': 1024, 'number_of_characters': 341738, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mal_Mlym-eng_Latn': {'num_samples': 1024, 'number_of_characters': 342816, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mal_Mlym-gom_Deva': {'num_samples': 1024, 'number_of_characters': 335312, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mal_Mlym-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 332158, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mal_Mlym-hin_Deva': {'num_samples': 1024, 'number_of_characters': 342824, 
'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mal_Mlym-kan_Knda': {'num_samples': 1024, 'number_of_characters': 354782, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mal_Mlym-kas_Arab': {'num_samples': 1024, 'number_of_characters': 345482, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mal_Mlym-mai_Deva': {'num_samples': 1024, 'number_of_characters': 331400, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'mal_Mlym-mar_Deva': {'num_samples': 1024, 'number_of_characters': 344502, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'mal_Mlym-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 335852, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'mal_Mlym-npi_Deva': {'num_samples': 1024, 
'number_of_characters': 336137, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mal_Mlym-ory_Orya': {'num_samples': 1024, 'number_of_characters': 356944, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mal_Mlym-pan_Guru': {'num_samples': 1024, 'number_of_characters': 329581, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mal_Mlym-san_Deva': {'num_samples': 1024, 'number_of_characters': 340797, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mal_Mlym-sat_Olck': {'num_samples': 1024, 'number_of_characters': 349450, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mal_Mlym-snd_Deva': {'num_samples': 1024, 'number_of_characters': 343139, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mal_Mlym-tam_Taml': 
{'num_samples': 1024, 'number_of_characters': 371064, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mal_Mlym-tel_Telu': {'num_samples': 1024, 'number_of_characters': 341763, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mal_Mlym-urd_Arab': {'num_samples': 1024, 'number_of_characters': 337852, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 178.88, 'max_sentence1_length': 692, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mar_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 321784, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mar_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 311488, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mar_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 324475, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 
'mar_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 319886, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mar_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 320964, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mar_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 313460, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mar_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 310306, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mar_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 320972, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mar_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 332930, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 
'unique_sentence2': 1024}, 'mar_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 323630, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mar_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 309548, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'mar_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 344502, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'mar_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 314000, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'mar_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 314285, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mar_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 335092, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 
'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mar_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 307729, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mar_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 318945, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mar_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 327598, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mar_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 321287, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mar_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 349212, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mar_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 319911, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 
'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mar_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 316000, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 157.54, 'max_sentence1_length': 555, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'mni_Mtei-asm_Beng': {'num_samples': 1024, 'number_of_characters': 313134, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'mni_Mtei-ben_Beng': {'num_samples': 1024, 'number_of_characters': 302838, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'mni_Mtei-brx_Deva': {'num_samples': 1024, 'number_of_characters': 315825, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'mni_Mtei-doi_Deva': {'num_samples': 1024, 'number_of_characters': 311236, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'mni_Mtei-eng_Latn': {'num_samples': 1024, 'number_of_characters': 312314, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 
'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'mni_Mtei-gom_Deva': {'num_samples': 1024, 'number_of_characters': 304810, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'mni_Mtei-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 301656, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'mni_Mtei-hin_Deva': {'num_samples': 1024, 'number_of_characters': 312322, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'mni_Mtei-kan_Knda': {'num_samples': 1024, 'number_of_characters': 324280, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'mni_Mtei-kas_Arab': {'num_samples': 1024, 'number_of_characters': 314980, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'mni_Mtei-mai_Deva': {'num_samples': 1024, 'number_of_characters': 300898, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 
'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'mni_Mtei-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 335852, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'mni_Mtei-mar_Deva': {'num_samples': 1024, 'number_of_characters': 314000, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'mni_Mtei-npi_Deva': {'num_samples': 1024, 'number_of_characters': 305635, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'mni_Mtei-ory_Orya': {'num_samples': 1024, 'number_of_characters': 326442, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'mni_Mtei-pan_Guru': {'num_samples': 1024, 'number_of_characters': 299079, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'mni_Mtei-san_Deva': {'num_samples': 1024, 'number_of_characters': 310295, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 
'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'mni_Mtei-sat_Olck': {'num_samples': 1024, 'number_of_characters': 318948, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'mni_Mtei-snd_Deva': {'num_samples': 1024, 'number_of_characters': 312637, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'mni_Mtei-tam_Taml': {'num_samples': 1024, 'number_of_characters': 340562, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'mni_Mtei-tel_Telu': {'num_samples': 1024, 'number_of_characters': 311261, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'mni_Mtei-urd_Arab': {'num_samples': 1024, 'number_of_characters': 307350, 'unique_pairs': 1024, 'min_sentence1_length': 16, 'average_sentence1_length': 149.1, 'max_sentence1_length': 597, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'npi_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 313419, 'unique_pairs': 1024, 'min_sentence1_length': 10, 
'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'npi_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 303123, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'npi_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 316110, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'npi_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 311521, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'npi_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 312599, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'npi_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 305095, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'npi_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 301941, 'unique_pairs': 1024, 
'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'npi_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 312607, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'npi_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 324565, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'npi_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 315265, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'npi_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 301183, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'npi_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 336137, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'npi_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 314285, 
'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'npi_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 305635, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'npi_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 326727, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'npi_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 299364, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'npi_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 310580, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'npi_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 319233, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'npi_Deva-snd_Deva': {'num_samples': 1024, 
'number_of_characters': 312922, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'npi_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 340847, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'npi_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 311546, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'npi_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 307635, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 149.38, 'max_sentence1_length': 525, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'ory_Orya-asm_Beng': {'num_samples': 1024, 'number_of_characters': 334226, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'ory_Orya-ben_Beng': {'num_samples': 1024, 'number_of_characters': 323930, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'ory_Orya-brx_Deva': 
{'num_samples': 1024, 'number_of_characters': 336917, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'ory_Orya-doi_Deva': {'num_samples': 1024, 'number_of_characters': 332328, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'ory_Orya-eng_Latn': {'num_samples': 1024, 'number_of_characters': 333406, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'ory_Orya-gom_Deva': {'num_samples': 1024, 'number_of_characters': 325902, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'ory_Orya-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 322748, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'ory_Orya-hin_Deva': {'num_samples': 1024, 'number_of_characters': 333414, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 
'ory_Orya-kan_Knda': {'num_samples': 1024, 'number_of_characters': 345372, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'ory_Orya-kas_Arab': {'num_samples': 1024, 'number_of_characters': 336072, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'ory_Orya-mai_Deva': {'num_samples': 1024, 'number_of_characters': 321990, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'ory_Orya-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 356944, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'ory_Orya-mar_Deva': {'num_samples': 1024, 'number_of_characters': 335092, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'ory_Orya-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 326442, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 
'unique_sentence2': 1024}, 'ory_Orya-npi_Deva': {'num_samples': 1024, 'number_of_characters': 326727, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'ory_Orya-pan_Guru': {'num_samples': 1024, 'number_of_characters': 320171, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'ory_Orya-san_Deva': {'num_samples': 1024, 'number_of_characters': 331387, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'ory_Orya-sat_Olck': {'num_samples': 1024, 'number_of_characters': 340040, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'ory_Orya-snd_Deva': {'num_samples': 1024, 'number_of_characters': 333729, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'ory_Orya-tam_Taml': {'num_samples': 1024, 'number_of_characters': 361654, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 
'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'ory_Orya-tel_Telu': {'num_samples': 1024, 'number_of_characters': 332353, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'ory_Orya-urd_Arab': {'num_samples': 1024, 'number_of_characters': 328442, 'unique_pairs': 1024, 'min_sentence1_length': 10, 'average_sentence1_length': 169.69, 'max_sentence1_length': 578, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'pan_Guru-asm_Beng': {'num_samples': 1024, 'number_of_characters': 306863, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'pan_Guru-ben_Beng': {'num_samples': 1024, 'number_of_characters': 296567, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'pan_Guru-brx_Deva': {'num_samples': 1024, 'number_of_characters': 309554, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'pan_Guru-doi_Deva': {'num_samples': 1024, 'number_of_characters': 304965, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 
'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'pan_Guru-eng_Latn': {'num_samples': 1024, 'number_of_characters': 306043, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'pan_Guru-gom_Deva': {'num_samples': 1024, 'number_of_characters': 298539, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'pan_Guru-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 295385, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'pan_Guru-hin_Deva': {'num_samples': 1024, 'number_of_characters': 306051, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'pan_Guru-kan_Knda': {'num_samples': 1024, 'number_of_characters': 318009, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'pan_Guru-kas_Arab': {'num_samples': 1024, 'number_of_characters': 308709, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 
'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'pan_Guru-mai_Deva': {'num_samples': 1024, 'number_of_characters': 294627, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'pan_Guru-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 329581, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'pan_Guru-mar_Deva': {'num_samples': 1024, 'number_of_characters': 307729, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'pan_Guru-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 299079, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'pan_Guru-npi_Deva': {'num_samples': 1024, 'number_of_characters': 299364, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'pan_Guru-ory_Orya': {'num_samples': 1024, 'number_of_characters': 320171, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 
'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'pan_Guru-san_Deva': {'num_samples': 1024, 'number_of_characters': 304024, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'pan_Guru-sat_Olck': {'num_samples': 1024, 'number_of_characters': 312677, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'pan_Guru-snd_Deva': {'num_samples': 1024, 'number_of_characters': 306366, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'pan_Guru-tam_Taml': {'num_samples': 1024, 'number_of_characters': 334291, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'pan_Guru-tel_Telu': {'num_samples': 1024, 'number_of_characters': 304990, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'pan_Guru-urd_Arab': {'num_samples': 1024, 'number_of_characters': 301079, 'unique_pairs': 1024, 'min_sentence1_length': 19, 'average_sentence1_length': 142.97, 
'max_sentence1_length': 476, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'san_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 318079, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'san_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 307783, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'san_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 320770, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'san_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 316181, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'san_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 317259, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'san_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 309755, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 
153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'san_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 306601, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'san_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 317267, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'san_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 329225, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'san_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 319925, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'san_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 305843, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'san_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 340797, 'unique_pairs': 1024, 'min_sentence1_length': 9, 
'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'san_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 318945, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'san_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 310295, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'san_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 310580, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'san_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 331387, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'san_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 304024, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'san_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 323893, 'unique_pairs': 1024, 
'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'san_Deva-snd_Deva': {'num_samples': 1024, 'number_of_characters': 317582, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'san_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 345507, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'san_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 316206, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'san_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 312295, 'unique_pairs': 1024, 'min_sentence1_length': 9, 'average_sentence1_length': 153.93, 'max_sentence1_length': 601, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'sat_Olck-asm_Beng': {'num_samples': 1024, 'number_of_characters': 326732, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'sat_Olck-ben_Beng': {'num_samples': 1024, 'number_of_characters': 316436, 
'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'sat_Olck-brx_Deva': {'num_samples': 1024, 'number_of_characters': 329423, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'sat_Olck-doi_Deva': {'num_samples': 1024, 'number_of_characters': 324834, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'sat_Olck-eng_Latn': {'num_samples': 1024, 'number_of_characters': 325912, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'sat_Olck-gom_Deva': {'num_samples': 1024, 'number_of_characters': 318408, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'sat_Olck-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 315254, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'sat_Olck-hin_Deva': {'num_samples': 1024, 
'number_of_characters': 325920, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'sat_Olck-kan_Knda': {'num_samples': 1024, 'number_of_characters': 337878, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'sat_Olck-kas_Arab': {'num_samples': 1024, 'number_of_characters': 328578, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'sat_Olck-mai_Deva': {'num_samples': 1024, 'number_of_characters': 314496, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'sat_Olck-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 349450, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'sat_Olck-mar_Deva': {'num_samples': 1024, 'number_of_characters': 327598, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'sat_Olck-mni_Mtei': 
{'num_samples': 1024, 'number_of_characters': 318948, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'sat_Olck-npi_Deva': {'num_samples': 1024, 'number_of_characters': 319233, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'sat_Olck-ory_Orya': {'num_samples': 1024, 'number_of_characters': 340040, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'sat_Olck-pan_Guru': {'num_samples': 1024, 'number_of_characters': 312677, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'sat_Olck-san_Deva': {'num_samples': 1024, 'number_of_characters': 323893, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'sat_Olck-snd_Deva': {'num_samples': 1024, 'number_of_characters': 326235, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 
'sat_Olck-tam_Taml': {'num_samples': 1024, 'number_of_characters': 354160, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'sat_Olck-tel_Telu': {'num_samples': 1024, 'number_of_characters': 324859, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'sat_Olck-urd_Arab': {'num_samples': 1024, 'number_of_characters': 320948, 'unique_pairs': 1024, 'min_sentence1_length': 11, 'average_sentence1_length': 162.38, 'max_sentence1_length': 536, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'snd_Deva-asm_Beng': {'num_samples': 1024, 'number_of_characters': 320421, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'snd_Deva-ben_Beng': {'num_samples': 1024, 'number_of_characters': 310125, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'snd_Deva-brx_Deva': {'num_samples': 1024, 'number_of_characters': 323112, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 
'unique_sentence2': 1024}, 'snd_Deva-doi_Deva': {'num_samples': 1024, 'number_of_characters': 318523, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'snd_Deva-eng_Latn': {'num_samples': 1024, 'number_of_characters': 319601, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'snd_Deva-gom_Deva': {'num_samples': 1024, 'number_of_characters': 312097, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'snd_Deva-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 308943, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'snd_Deva-hin_Deva': {'num_samples': 1024, 'number_of_characters': 319609, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'snd_Deva-kan_Knda': {'num_samples': 1024, 'number_of_characters': 331567, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 
'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'snd_Deva-kas_Arab': {'num_samples': 1024, 'number_of_characters': 322267, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'snd_Deva-mai_Deva': {'num_samples': 1024, 'number_of_characters': 308185, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'snd_Deva-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 343139, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'snd_Deva-mar_Deva': {'num_samples': 1024, 'number_of_characters': 321287, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'snd_Deva-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 312637, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'snd_Deva-npi_Deva': {'num_samples': 1024, 'number_of_characters': 312922, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 
'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'snd_Deva-ory_Orya': {'num_samples': 1024, 'number_of_characters': 333729, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'snd_Deva-pan_Guru': {'num_samples': 1024, 'number_of_characters': 306366, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'snd_Deva-san_Deva': {'num_samples': 1024, 'number_of_characters': 317582, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'snd_Deva-sat_Olck': {'num_samples': 1024, 'number_of_characters': 326235, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'snd_Deva-tam_Taml': {'num_samples': 1024, 'number_of_characters': 347849, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'snd_Deva-tel_Telu': {'num_samples': 1024, 'number_of_characters': 318548, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 
'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'snd_Deva-urd_Arab': {'num_samples': 1024, 'number_of_characters': 314637, 'unique_pairs': 1024, 'min_sentence1_length': 18, 'average_sentence1_length': 156.21, 'max_sentence1_length': 545, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'tam_Taml-asm_Beng': {'num_samples': 1024, 'number_of_characters': 348346, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'tam_Taml-ben_Beng': {'num_samples': 1024, 'number_of_characters': 338050, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'tam_Taml-brx_Deva': {'num_samples': 1024, 'number_of_characters': 351037, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'tam_Taml-doi_Deva': {'num_samples': 1024, 'number_of_characters': 346448, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'tam_Taml-eng_Latn': {'num_samples': 1024, 'number_of_characters': 347526, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 
'unique_sentence1': 1023, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'tam_Taml-gom_Deva': {'num_samples': 1024, 'number_of_characters': 340022, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'tam_Taml-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 336868, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'tam_Taml-hin_Deva': {'num_samples': 1024, 'number_of_characters': 347534, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'tam_Taml-kan_Knda': {'num_samples': 1024, 'number_of_characters': 359492, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'tam_Taml-kas_Arab': {'num_samples': 1024, 'number_of_characters': 350192, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'tam_Taml-mai_Deva': {'num_samples': 1024, 'number_of_characters': 336110, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 
'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'tam_Taml-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 371064, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'tam_Taml-mar_Deva': {'num_samples': 1024, 'number_of_characters': 349212, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'tam_Taml-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 340562, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'tam_Taml-npi_Deva': {'num_samples': 1024, 'number_of_characters': 340847, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'tam_Taml-ory_Orya': {'num_samples': 1024, 'number_of_characters': 361654, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'tam_Taml-pan_Guru': {'num_samples': 1024, 'number_of_characters': 334291, 'unique_pairs': 1024, 'min_sentence1_length': 32, 
'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'tam_Taml-san_Deva': {'num_samples': 1024, 'number_of_characters': 345507, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'tam_Taml-sat_Olck': {'num_samples': 1024, 'number_of_characters': 354160, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'tam_Taml-snd_Deva': {'num_samples': 1024, 'number_of_characters': 347849, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'tam_Taml-tel_Telu': {'num_samples': 1024, 'number_of_characters': 346473, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}, 'tam_Taml-urd_Arab': {'num_samples': 1024, 'number_of_characters': 342562, 'unique_pairs': 1024, 'min_sentence1_length': 32, 'average_sentence1_length': 183.48, 'max_sentence1_length': 614, 'unique_sentence1': 1023, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'tel_Telu-asm_Beng': {'num_samples': 1024, 'number_of_characters': 319045, 'unique_pairs': 1024, 
'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'tel_Telu-ben_Beng': {'num_samples': 1024, 'number_of_characters': 308749, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 'tel_Telu-brx_Deva': {'num_samples': 1024, 'number_of_characters': 321736, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'tel_Telu-doi_Deva': {'num_samples': 1024, 'number_of_characters': 317147, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'tel_Telu-eng_Latn': {'num_samples': 1024, 'number_of_characters': 318225, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'tel_Telu-gom_Deva': {'num_samples': 1024, 'number_of_characters': 310721, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'tel_Telu-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 307567, 
'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'tel_Telu-hin_Deva': {'num_samples': 1024, 'number_of_characters': 318233, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 'unique_sentence2': 1024}, 'tel_Telu-kan_Knda': {'num_samples': 1024, 'number_of_characters': 330191, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'tel_Telu-kas_Arab': {'num_samples': 1024, 'number_of_characters': 320891, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'tel_Telu-mai_Deva': {'num_samples': 1024, 'number_of_characters': 306809, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'tel_Telu-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 341763, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'tel_Telu-mar_Deva': {'num_samples': 1024, 
'number_of_characters': 319911, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'tel_Telu-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 311261, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'tel_Telu-npi_Deva': {'num_samples': 1024, 'number_of_characters': 311546, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'tel_Telu-ory_Orya': {'num_samples': 1024, 'number_of_characters': 332353, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'tel_Telu-pan_Guru': {'num_samples': 1024, 'number_of_characters': 304990, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'tel_Telu-san_Deva': {'num_samples': 1024, 'number_of_characters': 316206, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'tel_Telu-sat_Olck': 
{'num_samples': 1024, 'number_of_characters': 324859, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'tel_Telu-snd_Deva': {'num_samples': 1024, 'number_of_characters': 318548, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'tel_Telu-tam_Taml': {'num_samples': 1024, 'number_of_characters': 346473, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'tel_Telu-urd_Arab': {'num_samples': 1024, 'number_of_characters': 313261, 'unique_pairs': 1024, 'min_sentence1_length': 14, 'average_sentence1_length': 154.87, 'max_sentence1_length': 658, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 151.05, 'max_sentence2_length': 574, 'unique_sentence2': 1024}, 'urd_Arab-asm_Beng': {'num_samples': 1024, 'number_of_characters': 315134, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 156.7, 'max_sentence2_length': 582, 'unique_sentence2': 1024}, 'urd_Arab-ben_Beng': {'num_samples': 1024, 'number_of_characters': 304838, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 146.64, 'max_sentence2_length': 538, 'unique_sentence2': 1024}, 
'urd_Arab-brx_Deva': {'num_samples': 1024, 'number_of_characters': 317825, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 159.33, 'max_sentence2_length': 631, 'unique_sentence2': 1024}, 'urd_Arab-doi_Deva': {'num_samples': 1024, 'number_of_characters': 313236, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.84, 'max_sentence2_length': 500, 'unique_sentence2': 1024}, 'urd_Arab-eng_Latn': {'num_samples': 1024, 'number_of_characters': 314314, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 155.9, 'max_sentence2_length': 532, 'unique_sentence2': 1024}, 'urd_Arab-gom_Deva': {'num_samples': 1024, 'number_of_characters': 306810, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 17, 'average_sentence2_length': 148.57, 'max_sentence2_length': 537, 'unique_sentence2': 1024}, 'urd_Arab-guj_Gujr': {'num_samples': 1024, 'number_of_characters': 303656, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 145.49, 'max_sentence2_length': 488, 'unique_sentence2': 1024}, 'urd_Arab-hin_Deva': {'num_samples': 1024, 'number_of_characters': 314322, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 21, 'average_sentence2_length': 155.91, 'max_sentence2_length': 531, 
'unique_sentence2': 1024}, 'urd_Arab-kan_Knda': {'num_samples': 1024, 'number_of_characters': 326280, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 167.58, 'max_sentence2_length': 668, 'unique_sentence2': 1024}, 'urd_Arab-kas_Arab': {'num_samples': 1024, 'number_of_characters': 316980, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 158.5, 'max_sentence2_length': 520, 'unique_sentence2': 1024}, 'urd_Arab-mai_Deva': {'num_samples': 1024, 'number_of_characters': 302898, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 144.75, 'max_sentence2_length': 562, 'unique_sentence2': 1024}, 'urd_Arab-mal_Mlym': {'num_samples': 1024, 'number_of_characters': 337852, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 13, 'average_sentence2_length': 178.88, 'max_sentence2_length': 692, 'unique_sentence2': 1024}, 'urd_Arab-mar_Deva': {'num_samples': 1024, 'number_of_characters': 316000, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 157.54, 'max_sentence2_length': 555, 'unique_sentence2': 1024}, 'urd_Arab-mni_Mtei': {'num_samples': 1024, 'number_of_characters': 307350, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 16, 'average_sentence2_length': 149.1, 
'max_sentence2_length': 597, 'unique_sentence2': 1024}, 'urd_Arab-npi_Deva': {'num_samples': 1024, 'number_of_characters': 307635, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 149.38, 'max_sentence2_length': 525, 'unique_sentence2': 1024}, 'urd_Arab-ory_Orya': {'num_samples': 1024, 'number_of_characters': 328442, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 10, 'average_sentence2_length': 169.69, 'max_sentence2_length': 578, 'unique_sentence2': 1024}, 'urd_Arab-pan_Guru': {'num_samples': 1024, 'number_of_characters': 301079, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 19, 'average_sentence2_length': 142.97, 'max_sentence2_length': 476, 'unique_sentence2': 1024}, 'urd_Arab-san_Deva': {'num_samples': 1024, 'number_of_characters': 312295, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 9, 'average_sentence2_length': 153.93, 'max_sentence2_length': 601, 'unique_sentence2': 1024}, 'urd_Arab-sat_Olck': {'num_samples': 1024, 'number_of_characters': 320948, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 11, 'average_sentence2_length': 162.38, 'max_sentence2_length': 536, 'unique_sentence2': 1024}, 'urd_Arab-snd_Deva': {'num_samples': 1024, 'number_of_characters': 314637, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 18, 
'average_sentence2_length': 156.21, 'max_sentence2_length': 545, 'unique_sentence2': 1024}, 'urd_Arab-tam_Taml': {'num_samples': 1024, 'number_of_characters': 342562, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 32, 'average_sentence2_length': 183.48, 'max_sentence2_length': 614, 'unique_sentence2': 1023}, 'urd_Arab-tel_Telu': {'num_samples': 1024, 'number_of_characters': 313261, 'unique_pairs': 1024, 'min_sentence1_length': 13, 'average_sentence1_length': 151.05, 'max_sentence1_length': 574, 'unique_sentence1': 1024, 'min_sentence2_length': 14, 'average_sentence2_length': 154.87, 'max_sentence2_length': 658, 'unique_sentence2': 1024}}}} | +| [IWSLT2017BitextMining](https://aclanthology.org/2017.iwslt-1.1/) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'ita', 'jpn', 'kor', 'nld', 'ron'] | BitextMining | s2s | [Fiction, Non-fiction, Written] | {'validation': 21938} | {'validation': {'num_samples': 21938, 'number_of_characters': 4256244, 'unique_pairs': 21840, 'min_sentence1_length': 2, 'average_sentence1_length': 97.01, 'max_sentence1_length': 521, 'unique_sentence1': 11563, 'min_sentence2_length': 2, 'average_sentence2_length': 97.01, 'max_sentence2_length': 521, 'unique_sentence2': 11563, 'hf_subset_descriptive_stats': {'ar-en': {'num_samples': 888, 'number_of_characters': 172499, 'unique_pairs': 887, 'min_sentence1_length': 4, 'average_sentence1_length': 85.49, 'max_sentence1_length': 369, 'unique_sentence1': 887, 'min_sentence2_length': 10, 'average_sentence2_length': 108.77, 'max_sentence2_length': 462, 'unique_sentence2': 881}, 'de-en': {'num_samples': 888, 'number_of_characters': 202336, 'unique_pairs': 883, 'min_sentence1_length': 6, 'average_sentence1_length': 119.03, 'max_sentence1_length': 521, 'unique_sentence1': 881, 'min_sentence2_length': 10, 'average_sentence2_length': 108.83, 'max_sentence2_length': 462, 'unique_sentence2': 881}, 'en-ar': 
{'num_samples': 888, 'number_of_characters': 172499, 'unique_pairs': 887, 'min_sentence1_length': 10, 'average_sentence1_length': 108.77, 'max_sentence1_length': 462, 'unique_sentence1': 881, 'min_sentence2_length': 4, 'average_sentence2_length': 85.49, 'max_sentence2_length': 369, 'unique_sentence2': 887}, 'en-de': {'num_samples': 888, 'number_of_characters': 202336, 'unique_pairs': 883, 'min_sentence1_length': 10, 'average_sentence1_length': 108.83, 'max_sentence1_length': 462, 'unique_sentence1': 881, 'min_sentence2_length': 6, 'average_sentence2_length': 119.03, 'max_sentence2_length': 521, 'unique_sentence2': 881}, 'en-fr': {'num_samples': 890, 'number_of_characters': 197619, 'unique_pairs': 883, 'min_sentence1_length': 10, 'average_sentence1_length': 108.41, 'max_sentence1_length': 462, 'unique_sentence1': 883, 'min_sentence2_length': 6, 'average_sentence2_length': 113.63, 'max_sentence2_length': 493, 'unique_sentence2': 881}, 'en-it': {'num_samples': 929, 'number_of_characters': 191803, 'unique_pairs': 924, 'min_sentence1_length': 10, 'average_sentence1_length': 103.0, 'max_sentence1_length': 433, 'unique_sentence1': 922, 'min_sentence2_length': 7, 'average_sentence2_length': 103.46, 'max_sentence2_length': 444, 'unique_sentence2': 918}, 'en-ja': {'num_samples': 871, 'number_of_characters': 132742, 'unique_pairs': 867, 'min_sentence1_length': 10, 'average_sentence1_length': 109.81, 'max_sentence1_length': 462, 'unique_sentence1': 864, 'min_sentence2_length': 5, 'average_sentence2_length': 42.59, 'max_sentence2_length': 225, 'unique_sentence2': 866}, 'en-ko': {'num_samples': 879, 'number_of_characters': 142659, 'unique_pairs': 874, 'min_sentence1_length': 10, 'average_sentence1_length': 107.74, 'max_sentence1_length': 462, 'unique_sentence1': 872, 'min_sentence2_length': 3, 'average_sentence2_length': 54.56, 'max_sentence2_length': 250, 'unique_sentence2': 872}, 'en-nl': {'num_samples': 1003, 'number_of_characters': 189637, 'unique_pairs': 1000, 
'min_sentence1_length': 10, 'average_sentence1_length': 95.27, 'max_sentence1_length': 433, 'unique_sentence1': 996, 'min_sentence2_length': 4, 'average_sentence2_length': 93.8, 'max_sentence2_length': 477, 'unique_sentence2': 1000}, 'en-ro': {'num_samples': 914, 'number_of_characters': 194128, 'unique_pairs': 910, 'min_sentence1_length': 10, 'average_sentence1_length': 104.72, 'max_sentence1_length': 433, 'unique_sentence1': 907, 'min_sentence2_length': 9, 'average_sentence2_length': 107.67, 'max_sentence2_length': 448, 'unique_sentence2': 910}, 'en-zh': {'num_samples': 879, 'number_of_characters': 131126, 'unique_pairs': 877, 'min_sentence1_length': 10, 'average_sentence1_length': 109.37, 'max_sentence1_length': 462, 'unique_sentence1': 872, 'min_sentence2_length': 2, 'average_sentence2_length': 39.81, 'max_sentence2_length': 230, 'unique_sentence2': 867}, 'fr-en': {'num_samples': 890, 'number_of_characters': 197619, 'unique_pairs': 883, 'min_sentence1_length': 6, 'average_sentence1_length': 113.63, 'max_sentence1_length': 493, 'unique_sentence1': 881, 'min_sentence2_length': 10, 'average_sentence2_length': 108.41, 'max_sentence2_length': 462, 'unique_sentence2': 883}, 'it-en': {'num_samples': 929, 'number_of_characters': 191803, 'unique_pairs': 924, 'min_sentence1_length': 7, 'average_sentence1_length': 103.46, 'max_sentence1_length': 444, 'unique_sentence1': 918, 'min_sentence2_length': 10, 'average_sentence2_length': 103.0, 'max_sentence2_length': 433, 'unique_sentence2': 922}, 'it-nl': {'num_samples': 1001, 'number_of_characters': 188858, 'unique_pairs': 998, 'min_sentence1_length': 7, 'average_sentence1_length': 94.64, 'max_sentence1_length': 459, 'unique_sentence1': 994, 'min_sentence2_length': 7, 'average_sentence2_length': 94.03, 'max_sentence2_length': 505, 'unique_sentence2': 998}, 'it-ro': {'num_samples': 914, 'number_of_characters': 193339, 'unique_pairs': 911, 'min_sentence1_length': 7, 'average_sentence1_length': 103.91, 'max_sentence1_length': 435, 
'unique_sentence1': 907, 'min_sentence2_length': 9, 'average_sentence2_length': 107.62, 'max_sentence2_length': 448, 'unique_sentence2': 910}, 'ja-en': {'num_samples': 871, 'number_of_characters': 132742, 'unique_pairs': 867, 'min_sentence1_length': 5, 'average_sentence1_length': 42.59, 'max_sentence1_length': 225, 'unique_sentence1': 866, 'min_sentence2_length': 10, 'average_sentence2_length': 109.81, 'max_sentence2_length': 462, 'unique_sentence2': 864}, 'ko-en': {'num_samples': 879, 'number_of_characters': 142659, 'unique_pairs': 874, 'min_sentence1_length': 3, 'average_sentence1_length': 54.56, 'max_sentence1_length': 250, 'unique_sentence1': 872, 'min_sentence2_length': 10, 'average_sentence2_length': 107.74, 'max_sentence2_length': 462, 'unique_sentence2': 872}, 'nl-en': {'num_samples': 1003, 'number_of_characters': 189637, 'unique_pairs': 1000, 'min_sentence1_length': 4, 'average_sentence1_length': 93.8, 'max_sentence1_length': 477, 'unique_sentence1': 1000, 'min_sentence2_length': 10, 'average_sentence2_length': 95.27, 'max_sentence2_length': 433, 'unique_sentence2': 996}, 'nl-it': {'num_samples': 1001, 'number_of_characters': 188858, 'unique_pairs': 998, 'min_sentence1_length': 7, 'average_sentence1_length': 94.03, 'max_sentence1_length': 505, 'unique_sentence1': 998, 'min_sentence2_length': 7, 'average_sentence2_length': 94.64, 'max_sentence2_length': 459, 'unique_sentence2': 994}, 'nl-ro': {'num_samples': 913, 'number_of_characters': 191376, 'unique_pairs': 911, 'min_sentence1_length': 7, 'average_sentence1_length': 102.02, 'max_sentence1_length': 478, 'unique_sentence1': 909, 'min_sentence2_length': 9, 'average_sentence2_length': 107.59, 'max_sentence2_length': 515, 'unique_sentence2': 909}, 'ro-en': {'num_samples': 914, 'number_of_characters': 194128, 'unique_pairs': 910, 'min_sentence1_length': 9, 'average_sentence1_length': 107.67, 'max_sentence1_length': 448, 'unique_sentence1': 910, 'min_sentence2_length': 10, 'average_sentence2_length': 104.72, 
'max_sentence2_length': 433, 'unique_sentence2': 907}, 'ro-it': {'num_samples': 914, 'number_of_characters': 193339, 'unique_pairs': 911, 'min_sentence1_length': 9, 'average_sentence1_length': 107.62, 'max_sentence1_length': 448, 'unique_sentence1': 910, 'min_sentence2_length': 7, 'average_sentence2_length': 103.91, 'max_sentence2_length': 435, 'unique_sentence2': 907}, 'ro-nl': {'num_samples': 913, 'number_of_characters': 191376, 'unique_pairs': 911, 'min_sentence1_length': 9, 'average_sentence1_length': 107.59, 'max_sentence1_length': 515, 'unique_sentence1': 909, 'min_sentence2_length': 7, 'average_sentence2_length': 102.02, 'max_sentence2_length': 478, 'unique_sentence2': 909}, 'zh-en': {'num_samples': 879, 'number_of_characters': 131126, 'unique_pairs': 877, 'min_sentence1_length': 2, 'average_sentence1_length': 39.81, 'max_sentence1_length': 230, 'unique_sentence1': 867, 'min_sentence2_length': 10, 'average_sentence2_length': 109.37, 'max_sentence2_length': 462, 'unique_sentence2': 872}}}} | | [ImdbClassification](http://www.aclweb.org/anthology/P11-1015) | ['eng'] | Classification | p2p | [Reviews, Written] | None | None | -| [InappropriatenessClassification](https://aclanthology.org/2021.bsnlp-1.4) | ['rus'] | Classification | s2s | [Web, Social, Written] | None | None | -| [IndicCrosslingualSTS](https://huggingface.co/datasets/jaygala24/indic_sts) (Ramesh et al., 2022) | ['asm', 'ben', 'eng', 'guj', 'hin', 'kan', 'mal', 'mar', 'ory', 'pan', 'tam', 'tel', 'urd'] | STS | s2s | [News, Non-fiction, Web, Spoken, Government, Written, Spoken] | None | None | -| [IndicGenBenchFloresBitextMining](https://github.com/google-research-datasets/indic-gen-bench/) (Harman Singh, 2024) | ['asm', 'awa', 'ben', 'bgc', 'bho', 'bod', 'boy', 'eng', 'gbm', 'gom', 'guj', 'hin', 'hne', 'kan', 'mai', 'mal', 'mar', 'mni', 'mup', 'mwr', 'nep', 'ory', 'pan', 'pus', 'raj', 'san', 'sat', 'tam', 'tel', 'urd'] | BitextMining | s2s | [Web, News, Written] | {'validation': 57826, 'test': 
58696} | {'validation': {'num_samples': 57826, 'number_of_characters': 14600950, 'unique_pairs': 57826, 'min_sentence1_length': 24, 'average_sentence1_length': 126.25, 'max_sentence1_length': 368, 'unique_sentence1': 29903, 'min_sentence2_length': 24, 'average_sentence2_length': 126.24, 'max_sentence2_length': 368, 'unique_sentence2': 29903, 'hf_subset_descriptive_stats': {'ben-eng': {'num_samples': 997, 'number_of_characters': 248469, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 123.65, 'max_sentence1_length': 320, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-ben': {'num_samples': 997, 'number_of_characters': 248469, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 123.65, 'max_sentence2_length': 320, 'unique_sentence2': 997}, 'guj-eng': {'num_samples': 997, 'number_of_characters': 245477, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 120.64, 'max_sentence1_length': 368, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-guj': {'num_samples': 997, 'number_of_characters': 245477, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 120.64, 'max_sentence2_length': 368, 'unique_sentence2': 997}, 'hin-eng': {'num_samples': 997, 'number_of_characters': 250573, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 125.76, 'max_sentence1_length': 355, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-hin': 
{'num_samples': 997, 'number_of_characters': 250564, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 125.75, 'max_sentence2_length': 355, 'unique_sentence2': 997}, 'kan-eng': {'num_samples': 997, 'number_of_characters': 257131, 'unique_pairs': 997, 'min_sentence1_length': 34, 'average_sentence1_length': 132.33, 'max_sentence1_length': 331, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-kan': {'num_samples': 997, 'number_of_characters': 256986, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 34, 'average_sentence2_length': 132.19, 'max_sentence2_length': 331, 'unique_sentence2': 997}, 'mal-eng': {'num_samples': 997, 'number_of_characters': 267295, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 142.53, 'max_sentence1_length': 360, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mal': {'num_samples': 997, 'number_of_characters': 267296, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 142.53, 'max_sentence2_length': 360, 'unique_sentence2': 997}, 'mar-eng': {'num_samples': 997, 'number_of_characters': 251107, 'unique_pairs': 997, 'min_sentence1_length': 29, 'average_sentence1_length': 126.29, 'max_sentence1_length': 321, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mar': {'num_samples': 997, 'number_of_characters': 250897, 
'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 29, 'average_sentence2_length': 126.08, 'max_sentence2_length': 321, 'unique_sentence2': 997}, 'tam-eng': {'num_samples': 997, 'number_of_characters': 271322, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 146.57, 'max_sentence1_length': 358, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-tam': {'num_samples': 997, 'number_of_characters': 271322, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 146.57, 'max_sentence2_length': 358, 'unique_sentence2': 997}, 'tel-eng': {'num_samples': 997, 'number_of_characters': 252385, 'unique_pairs': 997, 'min_sentence1_length': 29, 'average_sentence1_length': 127.57, 'max_sentence1_length': 317, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-tel': {'num_samples': 997, 'number_of_characters': 252380, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 29, 'average_sentence2_length': 127.57, 'max_sentence2_length': 317, 'unique_sentence2': 997}, 'urd-eng': {'num_samples': 997, 'number_of_characters': 249824, 'unique_pairs': 997, 'min_sentence1_length': 37, 'average_sentence1_length': 125.01, 'max_sentence1_length': 295, 'unique_sentence1': 996, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-urd': {'num_samples': 997, 'number_of_characters': 249824, 'unique_pairs': 997, 'min_sentence1_length': 28, 
'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 37, 'average_sentence2_length': 125.01, 'max_sentence2_length': 295, 'unique_sentence2': 996}, 'asm-eng': {'num_samples': 997, 'number_of_characters': 246220, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 121.39, 'max_sentence1_length': 314, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-asm': {'num_samples': 997, 'number_of_characters': 246224, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 121.39, 'max_sentence2_length': 314, 'unique_sentence2': 997}, 'bho-eng': {'num_samples': 997, 'number_of_characters': 246895, 'unique_pairs': 997, 'min_sentence1_length': 25, 'average_sentence1_length': 122.07, 'max_sentence1_length': 326, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-bho': {'num_samples': 997, 'number_of_characters': 246919, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 25, 'average_sentence2_length': 122.09, 'max_sentence2_length': 326, 'unique_sentence2': 997}, 'nep-eng': {'num_samples': 997, 'number_of_characters': 245984, 'unique_pairs': 997, 'min_sentence1_length': 24, 'average_sentence1_length': 121.15, 'max_sentence1_length': 307, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-nep': {'num_samples': 997, 'number_of_characters': 245984, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 
'unique_sentence1': 997, 'min_sentence2_length': 24, 'average_sentence2_length': 121.15, 'max_sentence2_length': 307, 'unique_sentence2': 997}, 'ory-eng': {'num_samples': 997, 'number_of_characters': 254206, 'unique_pairs': 997, 'min_sentence1_length': 34, 'average_sentence1_length': 129.4, 'max_sentence1_length': 308, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-ory': {'num_samples': 997, 'number_of_characters': 254206, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 34, 'average_sentence2_length': 129.4, 'max_sentence2_length': 308, 'unique_sentence2': 997}, 'pan-eng': {'num_samples': 997, 'number_of_characters': 251598, 'unique_pairs': 997, 'min_sentence1_length': 29, 'average_sentence1_length': 126.78, 'max_sentence1_length': 309, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-pan': {'num_samples': 997, 'number_of_characters': 251597, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 29, 'average_sentence2_length': 126.78, 'max_sentence2_length': 309, 'unique_sentence2': 997}, 'pus-eng': {'num_samples': 997, 'number_of_characters': 247450, 'unique_pairs': 997, 'min_sentence1_length': 32, 'average_sentence1_length': 122.62, 'max_sentence1_length': 300, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-pus': {'num_samples': 997, 'number_of_characters': 247450, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 32, 
'average_sentence2_length': 122.62, 'max_sentence2_length': 300, 'unique_sentence2': 997}, 'san-eng': {'num_samples': 997, 'number_of_characters': 249042, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 124.22, 'max_sentence1_length': 311, 'unique_sentence1': 994, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-san': {'num_samples': 997, 'number_of_characters': 248877, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 124.06, 'max_sentence2_length': 311, 'unique_sentence2': 994}, 'awa-eng': {'num_samples': 997, 'number_of_characters': 247944, 'unique_pairs': 997, 'min_sentence1_length': 34, 'average_sentence1_length': 123.12, 'max_sentence1_length': 329, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-awa': {'num_samples': 997, 'number_of_characters': 247884, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 34, 'average_sentence2_length': 123.06, 'max_sentence2_length': 329, 'unique_sentence2': 997}, 'bgc-eng': {'num_samples': 997, 'number_of_characters': 245935, 'unique_pairs': 997, 'min_sentence1_length': 27, 'average_sentence1_length': 121.1, 'max_sentence1_length': 303, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-bgc': {'num_samples': 997, 'number_of_characters': 245935, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 27, 'average_sentence2_length': 121.1, 'max_sentence2_length': 303, 
'unique_sentence2': 997}, 'bod-eng': {'num_samples': 997, 'number_of_characters': 266515, 'unique_pairs': 997, 'min_sentence1_length': 26, 'average_sentence1_length': 141.75, 'max_sentence1_length': 355, 'unique_sentence1': 996, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-bod': {'num_samples': 997, 'number_of_characters': 266495, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 26, 'average_sentence2_length': 141.73, 'max_sentence2_length': 355, 'unique_sentence2': 996}, 'boy-eng': {'num_samples': 997, 'number_of_characters': 260174, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 135.39, 'max_sentence1_length': 312, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-boy': {'num_samples': 997, 'number_of_characters': 260174, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 135.39, 'max_sentence2_length': 312, 'unique_sentence2': 997}, 'gbm-eng': {'num_samples': 997, 'number_of_characters': 247009, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 122.18, 'max_sentence1_length': 344, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-gbm': {'num_samples': 997, 'number_of_characters': 247009, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 122.18, 'max_sentence2_length': 344, 'unique_sentence2': 997}, 'gom-eng': {'num_samples': 997, 
'number_of_characters': 244553, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 119.72, 'max_sentence1_length': 306, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-gom': {'num_samples': 997, 'number_of_characters': 244553, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 119.72, 'max_sentence2_length': 306, 'unique_sentence2': 997}, 'hne-eng': {'num_samples': 997, 'number_of_characters': 246416, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 121.59, 'max_sentence1_length': 321, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-hne': {'num_samples': 997, 'number_of_characters': 246405, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 121.58, 'max_sentence2_length': 321, 'unique_sentence2': 997}, 'raj-eng': {'num_samples': 997, 'number_of_characters': 249541, 'unique_pairs': 997, 'min_sentence1_length': 32, 'average_sentence1_length': 124.72, 'max_sentence1_length': 313, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-raj': {'num_samples': 997, 'number_of_characters': 249541, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 32, 'average_sentence2_length': 124.72, 'max_sentence2_length': 313, 'unique_sentence2': 997}, 'mai-eng': {'num_samples': 997, 'number_of_characters': 247991, 'unique_pairs': 997, 
'min_sentence1_length': 29, 'average_sentence1_length': 123.17, 'max_sentence1_length': 312, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mai': {'num_samples': 997, 'number_of_characters': 247994, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 29, 'average_sentence2_length': 123.17, 'max_sentence2_length': 312, 'unique_sentence2': 997}, 'mni-eng': {'num_samples': 997, 'number_of_characters': 254308, 'unique_pairs': 997, 'min_sentence1_length': 39, 'average_sentence1_length': 129.5, 'max_sentence1_length': 310, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mni': {'num_samples': 997, 'number_of_characters': 254312, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 39, 'average_sentence2_length': 129.51, 'max_sentence2_length': 310, 'unique_sentence2': 997}, 'mup-eng': {'num_samples': 997, 'number_of_characters': 248486, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 123.66, 'max_sentence1_length': 312, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mup': {'num_samples': 997, 'number_of_characters': 248486, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 123.66, 'max_sentence2_length': 312, 'unique_sentence2': 997}, 'mwr-eng': {'num_samples': 997, 'number_of_characters': 248641, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 123.82, 
'max_sentence1_length': 324, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mwr': {'num_samples': 997, 'number_of_characters': 248641, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 123.82, 'max_sentence2_length': 324, 'unique_sentence2': 997}, 'sat-eng': {'num_samples': 997, 'number_of_characters': 258279, 'unique_pairs': 997, 'min_sentence1_length': 37, 'average_sentence1_length': 133.49, 'max_sentence1_length': 333, 'unique_sentence1': 995, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-sat': {'num_samples': 997, 'number_of_characters': 258279, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 37, 'average_sentence2_length': 133.49, 'max_sentence2_length': 333, 'unique_sentence2': 995}}}, 'test': {'num_samples': 58696, 'number_of_characters': 15359416, 'unique_pairs': 58690, 'min_sentence1_length': 33, 'average_sentence1_length': 130.84, 'max_sentence1_length': 431, 'unique_sentence1': 30351, 'min_sentence2_length': 33, 'average_sentence2_length': 130.83, 'max_sentence2_length': 431, 'unique_sentence2': 30351, 'hf_subset_descriptive_stats': {'ben-eng': {'num_samples': 1012, 'number_of_characters': 261008, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 127.51, 'max_sentence1_length': 333, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-ben': {'num_samples': 1012, 'number_of_characters': 261008, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 
368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 127.51, 'max_sentence2_length': 333, 'unique_sentence2': 1012}, 'guj-eng': {'num_samples': 1012, 'number_of_characters': 258394, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 124.93, 'max_sentence1_length': 349, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-guj': {'num_samples': 1012, 'number_of_characters': 258394, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 124.93, 'max_sentence2_length': 349, 'unique_sentence2': 1012}, 'hin-eng': {'num_samples': 1012, 'number_of_characters': 263040, 'unique_pairs': 1012, 'min_sentence1_length': 41, 'average_sentence1_length': 129.52, 'max_sentence1_length': 381, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-hin': {'num_samples': 1012, 'number_of_characters': 263029, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 41, 'average_sentence2_length': 129.51, 'max_sentence2_length': 381, 'unique_sentence2': 1012}, 'kan-eng': {'num_samples': 1012, 'number_of_characters': 270091, 'unique_pairs': 1012, 'min_sentence1_length': 43, 'average_sentence1_length': 136.49, 'max_sentence1_length': 388, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-kan': {'num_samples': 1012, 'number_of_characters': 270021, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 
'min_sentence2_length': 43, 'average_sentence2_length': 136.42, 'max_sentence2_length': 388, 'unique_sentence2': 1012}, 'mal-eng': {'num_samples': 1012, 'number_of_characters': 281302, 'unique_pairs': 1012, 'min_sentence1_length': 48, 'average_sentence1_length': 147.57, 'max_sentence1_length': 376, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mal': {'num_samples': 1012, 'number_of_characters': 281302, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 48, 'average_sentence2_length': 147.57, 'max_sentence2_length': 376, 'unique_sentence2': 1012}, 'mar-eng': {'num_samples': 1012, 'number_of_characters': 265212, 'unique_pairs': 1012, 'min_sentence1_length': 34, 'average_sentence1_length': 131.67, 'max_sentence1_length': 356, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mar': {'num_samples': 1012, 'number_of_characters': 265023, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 34, 'average_sentence2_length': 131.48, 'max_sentence2_length': 355, 'unique_sentence2': 1012}, 'tam-eng': {'num_samples': 1012, 'number_of_characters': 286099, 'unique_pairs': 1012, 'min_sentence1_length': 48, 'average_sentence1_length': 152.31, 'max_sentence1_length': 404, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-tam': {'num_samples': 1012, 'number_of_characters': 286099, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 48, 
'average_sentence2_length': 152.31, 'max_sentence2_length': 404, 'unique_sentence2': 1012}, 'tel-eng': {'num_samples': 1012, 'number_of_characters': 264460, 'unique_pairs': 1012, 'min_sentence1_length': 39, 'average_sentence1_length': 130.92, 'max_sentence1_length': 359, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-tel': {'num_samples': 1012, 'number_of_characters': 264447, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 39, 'average_sentence2_length': 130.91, 'max_sentence2_length': 359, 'unique_sentence2': 1012}, 'urd-eng': {'num_samples': 1012, 'number_of_characters': 261886, 'unique_pairs': 1012, 'min_sentence1_length': 34, 'average_sentence1_length': 128.38, 'max_sentence1_length': 348, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-urd': {'num_samples': 1012, 'number_of_characters': 261885, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 34, 'average_sentence2_length': 128.38, 'max_sentence2_length': 348, 'unique_sentence2': 1012}, 'asm-eng': {'num_samples': 1012, 'number_of_characters': 257902, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 124.44, 'max_sentence1_length': 329, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-asm': {'num_samples': 1012, 'number_of_characters': 257909, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 124.45, 
'max_sentence2_length': 329, 'unique_sentence2': 1012}, 'bho-eng': {'num_samples': 1012, 'number_of_characters': 260578, 'unique_pairs': 1012, 'min_sentence1_length': 36, 'average_sentence1_length': 127.09, 'max_sentence1_length': 367, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-bho': {'num_samples': 1012, 'number_of_characters': 260601, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 36, 'average_sentence2_length': 127.11, 'max_sentence2_length': 367, 'unique_sentence2': 1012}, 'nep-eng': {'num_samples': 1012, 'number_of_characters': 258869, 'unique_pairs': 1012, 'min_sentence1_length': 34, 'average_sentence1_length': 125.4, 'max_sentence1_length': 362, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-nep': {'num_samples': 1012, 'number_of_characters': 258869, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 34, 'average_sentence2_length': 125.4, 'max_sentence2_length': 362, 'unique_sentence2': 1012}, 'ory-eng': {'num_samples': 1012, 'number_of_characters': 266805, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 133.24, 'max_sentence1_length': 354, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-ory': {'num_samples': 1012, 'number_of_characters': 266805, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 133.24, 'max_sentence2_length': 354, 'unique_sentence2': 
1012}, 'pan-eng': {'num_samples': 1012, 'number_of_characters': 265391, 'unique_pairs': 1012, 'min_sentence1_length': 37, 'average_sentence1_length': 131.84, 'max_sentence1_length': 380, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-pan': {'num_samples': 1012, 'number_of_characters': 265391, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 37, 'average_sentence2_length': 131.84, 'max_sentence2_length': 380, 'unique_sentence2': 1012}, 'pus-eng': {'num_samples': 1012, 'number_of_characters': 254422, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 121.0, 'max_sentence1_length': 325, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-pus': {'num_samples': 1012, 'number_of_characters': 254421, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 121.0, 'max_sentence2_length': 325, 'unique_sentence2': 1012}, 'san-eng': {'num_samples': 1012, 'number_of_characters': 260339, 'unique_pairs': 1012, 'min_sentence1_length': 33, 'average_sentence1_length': 126.85, 'max_sentence1_length': 358, 'unique_sentence1': 1011, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-san': {'num_samples': 1012, 'number_of_characters': 260224, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 33, 'average_sentence2_length': 126.74, 'max_sentence2_length': 358, 'unique_sentence2': 1011}, 'awa-eng': {'num_samples': 1012, 
'number_of_characters': 260179, 'unique_pairs': 1012, 'min_sentence1_length': 34, 'average_sentence1_length': 126.69, 'max_sentence1_length': 378, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-awa': {'num_samples': 1012, 'number_of_characters': 260137, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 34, 'average_sentence2_length': 126.65, 'max_sentence2_length': 378, 'unique_sentence2': 1012}, 'bgc-eng': {'num_samples': 1012, 'number_of_characters': 257450, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 124.0, 'max_sentence1_length': 332, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-bgc': {'num_samples': 1012, 'number_of_characters': 257450, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 124.0, 'max_sentence2_length': 332, 'unique_sentence2': 1012}, 'bod-eng': {'num_samples': 1012, 'number_of_characters': 280188, 'unique_pairs': 1012, 'min_sentence1_length': 42, 'average_sentence1_length': 146.46, 'max_sentence1_length': 431, 'unique_sentence1': 1009, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-bod': {'num_samples': 1012, 'number_of_characters': 280126, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 42, 'average_sentence2_length': 146.4, 'max_sentence2_length': 431, 'unique_sentence2': 1009}, 'boy-eng': {'num_samples': 1012, 'number_of_characters': 277538, 'unique_pairs': 
1012, 'min_sentence1_length': 36, 'average_sentence1_length': 143.85, 'max_sentence1_length': 396, 'unique_sentence1': 1011, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-boy': {'num_samples': 1012, 'number_of_characters': 277538, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 36, 'average_sentence2_length': 143.85, 'max_sentence2_length': 396, 'unique_sentence2': 1011}, 'gbm-eng': {'num_samples': 1012, 'number_of_characters': 261027, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 127.53, 'max_sentence1_length': 333, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-gbm': {'num_samples': 1012, 'number_of_characters': 261027, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 127.53, 'max_sentence2_length': 333, 'unique_sentence2': 1012}, 'gom-eng': {'num_samples': 1012, 'number_of_characters': 259182, 'unique_pairs': 1012, 'min_sentence1_length': 37, 'average_sentence1_length': 125.71, 'max_sentence1_length': 335, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-gom': {'num_samples': 1012, 'number_of_characters': 259182, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 37, 'average_sentence2_length': 125.71, 'max_sentence2_length': 335, 'unique_sentence2': 1012}, 'hne-eng': {'num_samples': 1012, 'number_of_characters': 258911, 'unique_pairs': 1012, 'min_sentence1_length': 42, 
'average_sentence1_length': 125.44, 'max_sentence1_length': 327, 'unique_sentence1': 1011, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-hne': {'num_samples': 1012, 'number_of_characters': 258915, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 42, 'average_sentence2_length': 125.44, 'max_sentence2_length': 326, 'unique_sentence2': 1011}, 'raj-eng': {'num_samples': 1012, 'number_of_characters': 261987, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 128.48, 'max_sentence1_length': 338, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-raj': {'num_samples': 1012, 'number_of_characters': 261987, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 128.48, 'max_sentence2_length': 338, 'unique_sentence2': 1012}, 'mai-eng': {'num_samples': 1012, 'number_of_characters': 261374, 'unique_pairs': 1012, 'min_sentence1_length': 36, 'average_sentence1_length': 127.87, 'max_sentence1_length': 350, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mai': {'num_samples': 1012, 'number_of_characters': 261377, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 36, 'average_sentence2_length': 127.88, 'max_sentence2_length': 350, 'unique_sentence2': 1012}, 'mni-eng': {'num_samples': 1012, 'number_of_characters': 268767, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 135.18, 
'max_sentence1_length': 353, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mni': {'num_samples': 1012, 'number_of_characters': 268768, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 135.18, 'max_sentence2_length': 354, 'unique_sentence2': 1012}, 'mup-eng': {'num_samples': 1012, 'number_of_characters': 262034, 'unique_pairs': 1012, 'min_sentence1_length': 40, 'average_sentence1_length': 128.53, 'max_sentence1_length': 340, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mup': {'num_samples': 1012, 'number_of_characters': 262034, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 40, 'average_sentence2_length': 128.53, 'max_sentence2_length': 340, 'unique_sentence2': 1012}, 'mwr-eng': {'num_samples': 1012, 'number_of_characters': 263749, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.22, 'max_sentence1_length': 345, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mwr': {'num_samples': 1012, 'number_of_characters': 263749, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.22, 'max_sentence2_length': 345, 'unique_sentence2': 1012}, 'sat-eng': {'num_samples': 1012, 'number_of_characters': 271757, 'unique_pairs': 1012, 'min_sentence1_length': 43, 'average_sentence1_length': 138.13, 'max_sentence1_length': 366, 'unique_sentence1': 
1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-sat': {'num_samples': 1012, 'number_of_characters': 271757, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 43, 'average_sentence2_length': 138.13, 'max_sentence2_length': 366, 'unique_sentence2': 1012}}}} | -| [IndicLangClassification](https://arxiv.org/abs/2305.15814) | ['asm', 'ben', 'brx', 'doi', 'gom', 'guj', 'hin', 'kan', 'kas', 'mai', 'mal', 'mar', 'mni', 'npi', 'ory', 'pan', 'san', 'sat', 'snd', 'tam', 'tel', 'urd'] | Classification | s2s | [Web, Non-fiction, Written] | None | None | +| [InappropriatenessClassification](https://aclanthology.org/2021.bsnlp-1.4) | ['rus'] | Classification | s2s | [Social, Web, Written] | None | None | +| [IndicCrosslingualSTS](https://huggingface.co/datasets/jaygala24/indic_sts) (Ramesh et al., 2022) | ['asm', 'ben', 'eng', 'guj', 'hin', 'kan', 'mal', 'mar', 'ory', 'pan', 'tam', 'tel', 'urd'] | STS | s2s | [Government, News, Non-fiction, Spoken, Spoken, Web, Written] | None | None | +| [IndicGenBenchFloresBitextMining](https://github.com/google-research-datasets/indic-gen-bench/) (Harman Singh, 2024) | ['asm', 'awa', 'ben', 'bgc', 'bho', 'bod', 'boy', 'eng', 'gbm', 'gom', 'guj', 'hin', 'hne', 'kan', 'mai', 'mal', 'mar', 'mni', 'mup', 'mwr', 'nep', 'ory', 'pan', 'pus', 'raj', 'san', 'sat', 'tam', 'tel', 'urd'] | BitextMining | s2s | [News, Web, Written] | {'validation': 57826, 'test': 58696} | {'validation': {'num_samples': 57826, 'number_of_characters': 14600950, 'unique_pairs': 57826, 'min_sentence1_length': 24, 'average_sentence1_length': 126.25, 'max_sentence1_length': 368, 'unique_sentence1': 29903, 'min_sentence2_length': 24, 'average_sentence2_length': 126.24, 'max_sentence2_length': 368, 'unique_sentence2': 29903, 'hf_subset_descriptive_stats': {'ben-eng': {'num_samples': 997, 
'number_of_characters': 248469, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 123.65, 'max_sentence1_length': 320, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-ben': {'num_samples': 997, 'number_of_characters': 248469, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 123.65, 'max_sentence2_length': 320, 'unique_sentence2': 997}, 'guj-eng': {'num_samples': 997, 'number_of_characters': 245477, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 120.64, 'max_sentence1_length': 368, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-guj': {'num_samples': 997, 'number_of_characters': 245477, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 120.64, 'max_sentence2_length': 368, 'unique_sentence2': 997}, 'hin-eng': {'num_samples': 997, 'number_of_characters': 250573, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 125.76, 'max_sentence1_length': 355, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-hin': {'num_samples': 997, 'number_of_characters': 250564, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 125.75, 'max_sentence2_length': 355, 'unique_sentence2': 997}, 'kan-eng': {'num_samples': 997, 'number_of_characters': 257131, 'unique_pairs': 997, 
'min_sentence1_length': 34, 'average_sentence1_length': 132.33, 'max_sentence1_length': 331, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-kan': {'num_samples': 997, 'number_of_characters': 256986, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 34, 'average_sentence2_length': 132.19, 'max_sentence2_length': 331, 'unique_sentence2': 997}, 'mal-eng': {'num_samples': 997, 'number_of_characters': 267295, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 142.53, 'max_sentence1_length': 360, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mal': {'num_samples': 997, 'number_of_characters': 267296, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 142.53, 'max_sentence2_length': 360, 'unique_sentence2': 997}, 'mar-eng': {'num_samples': 997, 'number_of_characters': 251107, 'unique_pairs': 997, 'min_sentence1_length': 29, 'average_sentence1_length': 126.29, 'max_sentence1_length': 321, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mar': {'num_samples': 997, 'number_of_characters': 250897, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 29, 'average_sentence2_length': 126.08, 'max_sentence2_length': 321, 'unique_sentence2': 997}, 'tam-eng': {'num_samples': 997, 'number_of_characters': 271322, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 146.57, 
'max_sentence1_length': 358, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-tam': {'num_samples': 997, 'number_of_characters': 271322, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 146.57, 'max_sentence2_length': 358, 'unique_sentence2': 997}, 'tel-eng': {'num_samples': 997, 'number_of_characters': 252385, 'unique_pairs': 997, 'min_sentence1_length': 29, 'average_sentence1_length': 127.57, 'max_sentence1_length': 317, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-tel': {'num_samples': 997, 'number_of_characters': 252380, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 29, 'average_sentence2_length': 127.57, 'max_sentence2_length': 317, 'unique_sentence2': 997}, 'urd-eng': {'num_samples': 997, 'number_of_characters': 249824, 'unique_pairs': 997, 'min_sentence1_length': 37, 'average_sentence1_length': 125.01, 'max_sentence1_length': 295, 'unique_sentence1': 996, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-urd': {'num_samples': 997, 'number_of_characters': 249824, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 37, 'average_sentence2_length': 125.01, 'max_sentence2_length': 295, 'unique_sentence2': 996}, 'asm-eng': {'num_samples': 997, 'number_of_characters': 246220, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 121.39, 'max_sentence1_length': 314, 'unique_sentence1': 997, 
'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-asm': {'num_samples': 997, 'number_of_characters': 246224, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 121.39, 'max_sentence2_length': 314, 'unique_sentence2': 997}, 'bho-eng': {'num_samples': 997, 'number_of_characters': 246895, 'unique_pairs': 997, 'min_sentence1_length': 25, 'average_sentence1_length': 122.07, 'max_sentence1_length': 326, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-bho': {'num_samples': 997, 'number_of_characters': 246919, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 25, 'average_sentence2_length': 122.09, 'max_sentence2_length': 326, 'unique_sentence2': 997}, 'nep-eng': {'num_samples': 997, 'number_of_characters': 245984, 'unique_pairs': 997, 'min_sentence1_length': 24, 'average_sentence1_length': 121.15, 'max_sentence1_length': 307, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-nep': {'num_samples': 997, 'number_of_characters': 245984, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 24, 'average_sentence2_length': 121.15, 'max_sentence2_length': 307, 'unique_sentence2': 997}, 'ory-eng': {'num_samples': 997, 'number_of_characters': 254206, 'unique_pairs': 997, 'min_sentence1_length': 34, 'average_sentence1_length': 129.4, 'max_sentence1_length': 308, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 
'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-ory': {'num_samples': 997, 'number_of_characters': 254206, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 34, 'average_sentence2_length': 129.4, 'max_sentence2_length': 308, 'unique_sentence2': 997}, 'pan-eng': {'num_samples': 997, 'number_of_characters': 251598, 'unique_pairs': 997, 'min_sentence1_length': 29, 'average_sentence1_length': 126.78, 'max_sentence1_length': 309, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-pan': {'num_samples': 997, 'number_of_characters': 251597, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 29, 'average_sentence2_length': 126.78, 'max_sentence2_length': 309, 'unique_sentence2': 997}, 'pus-eng': {'num_samples': 997, 'number_of_characters': 247450, 'unique_pairs': 997, 'min_sentence1_length': 32, 'average_sentence1_length': 122.62, 'max_sentence1_length': 300, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-pus': {'num_samples': 997, 'number_of_characters': 247450, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 32, 'average_sentence2_length': 122.62, 'max_sentence2_length': 300, 'unique_sentence2': 997}, 'san-eng': {'num_samples': 997, 'number_of_characters': 249042, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 124.22, 'max_sentence1_length': 311, 'unique_sentence1': 994, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-san': 
{'num_samples': 997, 'number_of_characters': 248877, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 124.06, 'max_sentence2_length': 311, 'unique_sentence2': 994}, 'awa-eng': {'num_samples': 997, 'number_of_characters': 247944, 'unique_pairs': 997, 'min_sentence1_length': 34, 'average_sentence1_length': 123.12, 'max_sentence1_length': 329, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-awa': {'num_samples': 997, 'number_of_characters': 247884, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 34, 'average_sentence2_length': 123.06, 'max_sentence2_length': 329, 'unique_sentence2': 997}, 'bgc-eng': {'num_samples': 997, 'number_of_characters': 245935, 'unique_pairs': 997, 'min_sentence1_length': 27, 'average_sentence1_length': 121.1, 'max_sentence1_length': 303, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-bgc': {'num_samples': 997, 'number_of_characters': 245935, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 27, 'average_sentence2_length': 121.1, 'max_sentence2_length': 303, 'unique_sentence2': 997}, 'bod-eng': {'num_samples': 997, 'number_of_characters': 266515, 'unique_pairs': 997, 'min_sentence1_length': 26, 'average_sentence1_length': 141.75, 'max_sentence1_length': 355, 'unique_sentence1': 996, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-bod': {'num_samples': 997, 'number_of_characters': 266495, 'unique_pairs': 
997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 26, 'average_sentence2_length': 141.73, 'max_sentence2_length': 355, 'unique_sentence2': 996}, 'boy-eng': {'num_samples': 997, 'number_of_characters': 260174, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 135.39, 'max_sentence1_length': 312, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-boy': {'num_samples': 997, 'number_of_characters': 260174, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 135.39, 'max_sentence2_length': 312, 'unique_sentence2': 997}, 'gbm-eng': {'num_samples': 997, 'number_of_characters': 247009, 'unique_pairs': 997, 'min_sentence1_length': 30, 'average_sentence1_length': 122.18, 'max_sentence1_length': 344, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-gbm': {'num_samples': 997, 'number_of_characters': 247009, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 30, 'average_sentence2_length': 122.18, 'max_sentence2_length': 344, 'unique_sentence2': 997}, 'gom-eng': {'num_samples': 997, 'number_of_characters': 244553, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 119.72, 'max_sentence1_length': 306, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-gom': {'num_samples': 997, 'number_of_characters': 244553, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 
125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 119.72, 'max_sentence2_length': 306, 'unique_sentence2': 997}, 'hne-eng': {'num_samples': 997, 'number_of_characters': 246416, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 121.59, 'max_sentence1_length': 321, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-hne': {'num_samples': 997, 'number_of_characters': 246405, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 121.58, 'max_sentence2_length': 321, 'unique_sentence2': 997}, 'raj-eng': {'num_samples': 997, 'number_of_characters': 249541, 'unique_pairs': 997, 'min_sentence1_length': 32, 'average_sentence1_length': 124.72, 'max_sentence1_length': 313, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-raj': {'num_samples': 997, 'number_of_characters': 249541, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 32, 'average_sentence2_length': 124.72, 'max_sentence2_length': 313, 'unique_sentence2': 997}, 'mai-eng': {'num_samples': 997, 'number_of_characters': 247991, 'unique_pairs': 997, 'min_sentence1_length': 29, 'average_sentence1_length': 123.17, 'max_sentence1_length': 312, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mai': {'num_samples': 997, 'number_of_characters': 247994, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 
'min_sentence2_length': 29, 'average_sentence2_length': 123.17, 'max_sentence2_length': 312, 'unique_sentence2': 997}, 'mni-eng': {'num_samples': 997, 'number_of_characters': 254308, 'unique_pairs': 997, 'min_sentence1_length': 39, 'average_sentence1_length': 129.5, 'max_sentence1_length': 310, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mni': {'num_samples': 997, 'number_of_characters': 254312, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 39, 'average_sentence2_length': 129.51, 'max_sentence2_length': 310, 'unique_sentence2': 997}, 'mup-eng': {'num_samples': 997, 'number_of_characters': 248486, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 123.66, 'max_sentence1_length': 312, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mup': {'num_samples': 997, 'number_of_characters': 248486, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 123.66, 'max_sentence2_length': 312, 'unique_sentence2': 997}, 'mwr-eng': {'num_samples': 997, 'number_of_characters': 248641, 'unique_pairs': 997, 'min_sentence1_length': 31, 'average_sentence1_length': 123.82, 'max_sentence1_length': 324, 'unique_sentence1': 997, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-mwr': {'num_samples': 997, 'number_of_characters': 248641, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 31, 'average_sentence2_length': 123.82, 
'max_sentence2_length': 324, 'unique_sentence2': 997}, 'sat-eng': {'num_samples': 997, 'number_of_characters': 258279, 'unique_pairs': 997, 'min_sentence1_length': 37, 'average_sentence1_length': 133.49, 'max_sentence1_length': 333, 'unique_sentence1': 995, 'min_sentence2_length': 28, 'average_sentence2_length': 125.57, 'max_sentence2_length': 297, 'unique_sentence2': 997}, 'eng-sat': {'num_samples': 997, 'number_of_characters': 258279, 'unique_pairs': 997, 'min_sentence1_length': 28, 'average_sentence1_length': 125.57, 'max_sentence1_length': 297, 'unique_sentence1': 997, 'min_sentence2_length': 37, 'average_sentence2_length': 133.49, 'max_sentence2_length': 333, 'unique_sentence2': 995}}}, 'test': {'num_samples': 58696, 'number_of_characters': 15359416, 'unique_pairs': 58690, 'min_sentence1_length': 33, 'average_sentence1_length': 130.84, 'max_sentence1_length': 431, 'unique_sentence1': 30351, 'min_sentence2_length': 33, 'average_sentence2_length': 130.83, 'max_sentence2_length': 431, 'unique_sentence2': 30351, 'hf_subset_descriptive_stats': {'ben-eng': {'num_samples': 1012, 'number_of_characters': 261008, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 127.51, 'max_sentence1_length': 333, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-ben': {'num_samples': 1012, 'number_of_characters': 261008, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 127.51, 'max_sentence2_length': 333, 'unique_sentence2': 1012}, 'guj-eng': {'num_samples': 1012, 'number_of_characters': 258394, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 124.93, 'max_sentence1_length': 349, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 
'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-guj': {'num_samples': 1012, 'number_of_characters': 258394, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 124.93, 'max_sentence2_length': 349, 'unique_sentence2': 1012}, 'hin-eng': {'num_samples': 1012, 'number_of_characters': 263040, 'unique_pairs': 1012, 'min_sentence1_length': 41, 'average_sentence1_length': 129.52, 'max_sentence1_length': 381, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-hin': {'num_samples': 1012, 'number_of_characters': 263029, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 41, 'average_sentence2_length': 129.51, 'max_sentence2_length': 381, 'unique_sentence2': 1012}, 'kan-eng': {'num_samples': 1012, 'number_of_characters': 270091, 'unique_pairs': 1012, 'min_sentence1_length': 43, 'average_sentence1_length': 136.49, 'max_sentence1_length': 388, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-kan': {'num_samples': 1012, 'number_of_characters': 270021, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 43, 'average_sentence2_length': 136.42, 'max_sentence2_length': 388, 'unique_sentence2': 1012}, 'mal-eng': {'num_samples': 1012, 'number_of_characters': 281302, 'unique_pairs': 1012, 'min_sentence1_length': 48, 'average_sentence1_length': 147.57, 'max_sentence1_length': 376, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 
1012}, 'eng-mal': {'num_samples': 1012, 'number_of_characters': 281302, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 48, 'average_sentence2_length': 147.57, 'max_sentence2_length': 376, 'unique_sentence2': 1012}, 'mar-eng': {'num_samples': 1012, 'number_of_characters': 265212, 'unique_pairs': 1012, 'min_sentence1_length': 34, 'average_sentence1_length': 131.67, 'max_sentence1_length': 356, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mar': {'num_samples': 1012, 'number_of_characters': 265023, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 34, 'average_sentence2_length': 131.48, 'max_sentence2_length': 355, 'unique_sentence2': 1012}, 'tam-eng': {'num_samples': 1012, 'number_of_characters': 286099, 'unique_pairs': 1012, 'min_sentence1_length': 48, 'average_sentence1_length': 152.31, 'max_sentence1_length': 404, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-tam': {'num_samples': 1012, 'number_of_characters': 286099, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 48, 'average_sentence2_length': 152.31, 'max_sentence2_length': 404, 'unique_sentence2': 1012}, 'tel-eng': {'num_samples': 1012, 'number_of_characters': 264460, 'unique_pairs': 1012, 'min_sentence1_length': 39, 'average_sentence1_length': 130.92, 'max_sentence1_length': 359, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-tel': {'num_samples': 1012, 
'number_of_characters': 264447, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 39, 'average_sentence2_length': 130.91, 'max_sentence2_length': 359, 'unique_sentence2': 1012}, 'urd-eng': {'num_samples': 1012, 'number_of_characters': 261886, 'unique_pairs': 1012, 'min_sentence1_length': 34, 'average_sentence1_length': 128.38, 'max_sentence1_length': 348, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-urd': {'num_samples': 1012, 'number_of_characters': 261885, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 34, 'average_sentence2_length': 128.38, 'max_sentence2_length': 348, 'unique_sentence2': 1012}, 'asm-eng': {'num_samples': 1012, 'number_of_characters': 257902, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 124.44, 'max_sentence1_length': 329, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-asm': {'num_samples': 1012, 'number_of_characters': 257909, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 124.45, 'max_sentence2_length': 329, 'unique_sentence2': 1012}, 'bho-eng': {'num_samples': 1012, 'number_of_characters': 260578, 'unique_pairs': 1012, 'min_sentence1_length': 36, 'average_sentence1_length': 127.09, 'max_sentence1_length': 367, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-bho': {'num_samples': 1012, 'number_of_characters': 260601, 'unique_pairs': 
1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 36, 'average_sentence2_length': 127.11, 'max_sentence2_length': 367, 'unique_sentence2': 1012}, 'nep-eng': {'num_samples': 1012, 'number_of_characters': 258869, 'unique_pairs': 1012, 'min_sentence1_length': 34, 'average_sentence1_length': 125.4, 'max_sentence1_length': 362, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-nep': {'num_samples': 1012, 'number_of_characters': 258869, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 34, 'average_sentence2_length': 125.4, 'max_sentence2_length': 362, 'unique_sentence2': 1012}, 'ory-eng': {'num_samples': 1012, 'number_of_characters': 266805, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 133.24, 'max_sentence1_length': 354, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-ory': {'num_samples': 1012, 'number_of_characters': 266805, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 133.24, 'max_sentence2_length': 354, 'unique_sentence2': 1012}, 'pan-eng': {'num_samples': 1012, 'number_of_characters': 265391, 'unique_pairs': 1012, 'min_sentence1_length': 37, 'average_sentence1_length': 131.84, 'max_sentence1_length': 380, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-pan': {'num_samples': 1012, 'number_of_characters': 265391, 'unique_pairs': 1012, 'min_sentence1_length': 35, 
'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 37, 'average_sentence2_length': 131.84, 'max_sentence2_length': 380, 'unique_sentence2': 1012}, 'pus-eng': {'num_samples': 1012, 'number_of_characters': 254422, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 121.0, 'max_sentence1_length': 325, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-pus': {'num_samples': 1012, 'number_of_characters': 254421, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 121.0, 'max_sentence2_length': 325, 'unique_sentence2': 1012}, 'san-eng': {'num_samples': 1012, 'number_of_characters': 260339, 'unique_pairs': 1012, 'min_sentence1_length': 33, 'average_sentence1_length': 126.85, 'max_sentence1_length': 358, 'unique_sentence1': 1011, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-san': {'num_samples': 1012, 'number_of_characters': 260224, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 33, 'average_sentence2_length': 126.74, 'max_sentence2_length': 358, 'unique_sentence2': 1011}, 'awa-eng': {'num_samples': 1012, 'number_of_characters': 260179, 'unique_pairs': 1012, 'min_sentence1_length': 34, 'average_sentence1_length': 126.69, 'max_sentence1_length': 378, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-awa': {'num_samples': 1012, 'number_of_characters': 260137, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 
'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 34, 'average_sentence2_length': 126.65, 'max_sentence2_length': 378, 'unique_sentence2': 1012}, 'bgc-eng': {'num_samples': 1012, 'number_of_characters': 257450, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 124.0, 'max_sentence1_length': 332, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-bgc': {'num_samples': 1012, 'number_of_characters': 257450, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 124.0, 'max_sentence2_length': 332, 'unique_sentence2': 1012}, 'bod-eng': {'num_samples': 1012, 'number_of_characters': 280188, 'unique_pairs': 1012, 'min_sentence1_length': 42, 'average_sentence1_length': 146.46, 'max_sentence1_length': 431, 'unique_sentence1': 1009, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-bod': {'num_samples': 1012, 'number_of_characters': 280126, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 42, 'average_sentence2_length': 146.4, 'max_sentence2_length': 431, 'unique_sentence2': 1009}, 'boy-eng': {'num_samples': 1012, 'number_of_characters': 277538, 'unique_pairs': 1012, 'min_sentence1_length': 36, 'average_sentence1_length': 143.85, 'max_sentence1_length': 396, 'unique_sentence1': 1011, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-boy': {'num_samples': 1012, 'number_of_characters': 277538, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 
1012, 'min_sentence2_length': 36, 'average_sentence2_length': 143.85, 'max_sentence2_length': 396, 'unique_sentence2': 1011}, 'gbm-eng': {'num_samples': 1012, 'number_of_characters': 261027, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 127.53, 'max_sentence1_length': 333, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-gbm': {'num_samples': 1012, 'number_of_characters': 261027, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 127.53, 'max_sentence2_length': 333, 'unique_sentence2': 1012}, 'gom-eng': {'num_samples': 1012, 'number_of_characters': 259182, 'unique_pairs': 1012, 'min_sentence1_length': 37, 'average_sentence1_length': 125.71, 'max_sentence1_length': 335, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-gom': {'num_samples': 1012, 'number_of_characters': 259182, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 37, 'average_sentence2_length': 125.71, 'max_sentence2_length': 335, 'unique_sentence2': 1012}, 'hne-eng': {'num_samples': 1012, 'number_of_characters': 258911, 'unique_pairs': 1012, 'min_sentence1_length': 42, 'average_sentence1_length': 125.44, 'max_sentence1_length': 327, 'unique_sentence1': 1011, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-hne': {'num_samples': 1012, 'number_of_characters': 258915, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 42, 
'average_sentence2_length': 125.44, 'max_sentence2_length': 326, 'unique_sentence2': 1011}, 'raj-eng': {'num_samples': 1012, 'number_of_characters': 261987, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 128.48, 'max_sentence1_length': 338, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-raj': {'num_samples': 1012, 'number_of_characters': 261987, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 128.48, 'max_sentence2_length': 338, 'unique_sentence2': 1012}, 'mai-eng': {'num_samples': 1012, 'number_of_characters': 261374, 'unique_pairs': 1012, 'min_sentence1_length': 36, 'average_sentence1_length': 127.87, 'max_sentence1_length': 350, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mai': {'num_samples': 1012, 'number_of_characters': 261377, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 36, 'average_sentence2_length': 127.88, 'max_sentence2_length': 350, 'unique_sentence2': 1012}, 'mni-eng': {'num_samples': 1012, 'number_of_characters': 268767, 'unique_pairs': 1012, 'min_sentence1_length': 38, 'average_sentence1_length': 135.18, 'max_sentence1_length': 353, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mni': {'num_samples': 1012, 'number_of_characters': 268768, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 38, 'average_sentence2_length': 135.18, 
'max_sentence2_length': 354, 'unique_sentence2': 1012}, 'mup-eng': {'num_samples': 1012, 'number_of_characters': 262034, 'unique_pairs': 1012, 'min_sentence1_length': 40, 'average_sentence1_length': 128.53, 'max_sentence1_length': 340, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mup': {'num_samples': 1012, 'number_of_characters': 262034, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 40, 'average_sentence2_length': 128.53, 'max_sentence2_length': 340, 'unique_sentence2': 1012}, 'mwr-eng': {'num_samples': 1012, 'number_of_characters': 263749, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.22, 'max_sentence1_length': 345, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-mwr': {'num_samples': 1012, 'number_of_characters': 263749, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.22, 'max_sentence2_length': 345, 'unique_sentence2': 1012}, 'sat-eng': {'num_samples': 1012, 'number_of_characters': 271757, 'unique_pairs': 1012, 'min_sentence1_length': 43, 'average_sentence1_length': 138.13, 'max_sentence1_length': 366, 'unique_sentence1': 1012, 'min_sentence2_length': 35, 'average_sentence2_length': 130.4, 'max_sentence2_length': 368, 'unique_sentence2': 1012}, 'eng-sat': {'num_samples': 1012, 'number_of_characters': 271757, 'unique_pairs': 1012, 'min_sentence1_length': 35, 'average_sentence1_length': 130.4, 'max_sentence1_length': 368, 'unique_sentence1': 1012, 'min_sentence2_length': 43, 'average_sentence2_length': 138.13, 'max_sentence2_length': 366, 'unique_sentence2': 
1012}}}} | +| [IndicLangClassification](https://arxiv.org/abs/2305.15814) | ['asm', 'ben', 'brx', 'doi', 'gom', 'guj', 'hin', 'kan', 'kas', 'mai', 'mal', 'mar', 'mni', 'npi', 'ory', 'pan', 'san', 'sat', 'snd', 'tam', 'tel', 'urd'] | Classification | s2s | [Non-fiction, Web, Written] | None | None | | [IndicNLPNewsClassification](https://github.com/AI4Bharat/indicnlp_corpus#indicnlp-news-article-classification-dataset) (Anoop Kunchukuttan, 2020) | ['guj', 'kan', 'mal', 'mar', 'ori', 'pan', 'tam', 'tel'] | Classification | s2s | [News, Written] | None | None | | [IndicQARetrieval](https://arxiv.org/abs/2212.05409) (Sumanth Doddapaneni, 2022) | ['asm', 'ben', 'guj', 'hin', 'kan', 'mal', 'mar', 'ory', 'pan', 'tam', 'tel'] | Retrieval | s2p | [Web, Written] | None | None | | [IndicReviewsClusteringP2P](https://arxiv.org/abs/2212.05409) (Sumanth Doddapaneni, 2022) | ['asm', 'ben', 'brx', 'guj', 'hin', 'kan', 'mal', 'mar', 'ory', 'pan', 'tam', 'tel', 'urd'] | Clustering | p2p | [Reviews, Written] | None | None | @@ -278,7 +278,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [InsurancePolicyInterpretationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [InternationalCitizenshipQuestionsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [IsiZuluNewsClassification](https://huggingface.co/datasets/dsfsi/za-isizulu-siswati-news) (Madodonga et al., 2023) | ['zul'] | Classification | s2s | [News, Written] | None | None | -| [ItaCaseholdClassification](https://doi.org/10.1145/3594536.3595177) (Licari et al., 2023) | ['ita'] | Classification | s2s | [Legal, Government, Written] | None | None | +| [ItaCaseholdClassification](https://doi.org/10.1145/3594536.3595177) (Licari et al., 2023) | ['ita'] | Classification | s2s | [Government, Legal, Written] | None | None | | [Itacola](https://aclanthology.org/2021.findings-emnlp.250/) | ['ita'] | Classification | s2s | [Non-fiction, Spoken, Written] | None | None | | [JCrewBlockerLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [JDReview](https://aclanthology.org/2023.nodalida-1.20/) (Xiao et al., 2023) | ['cmn'] | Classification | s2s | | None | None | @@ -288,13 +288,13 @@ The following tables give you an overview of the tasks in MTEB. 
| [JaQuADRetrieval](https://arxiv.org/abs/2202.01764) (ByungHoon So, 2022) | ['jpn'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Written] | None | None | | [JaqketRetrieval](https://github.com/kumapo/JAQKET-dataset) | ['jpn'] | Retrieval | s2p | [Encyclopaedic, Non-fiction, Written] | {'test': 115226} | {'test': {'number_of_characters': 428294530, 'num_samples': 115226, 'num_queries': 997, 'num_documents': 114229, 'min_document_length': 16, 'average_document_length': 0.44, 'max_document_length': 98, 'unique_documents': 114229, 'min_query_length': 8, 'average_query_length': 429532.57, 'max_query_length': 188424, 'unique_queries': 997, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 989}} | | [JavaneseIMDBClassification](https://github.com/w11wo/nlp-datasets#javanese-imdb) (Wongso et al., 2021) | ['jav'] | Classification | s2s | [Reviews, Written] | None | None | -| [KLUE-NLI](https://arxiv.org/abs/2105.09680) (Sungjoon Park, 2021) | ['kor'] | PairClassification | s2s | [News, Encyclopaedic, Written] | None | None | -| [KLUE-STS](https://arxiv.org/abs/2105.09680) (Sungjoon Park, 2021) | ['kor'] | STS | s2s | [Reviews, News, Spoken, Written, Spoken] | None | None | +| [KLUE-NLI](https://arxiv.org/abs/2105.09680) (Sungjoon Park, 2021) | ['kor'] | PairClassification | s2s | [Encyclopaedic, News, Written] | None | None | +| [KLUE-STS](https://arxiv.org/abs/2105.09680) (Sungjoon Park, 2021) | ['kor'] | STS | s2s | [News, Reviews, Spoken, Spoken, Written] | None | None | | [KLUE-TC](https://arxiv.org/abs/2105.09680) (Sungjoon Park, 2021) | ['kor'] | Classification | s2s | [News, Written] | None | None | | [KannadaNewsClassification](https://github.com/goru001/nlp-for-kannada) (Anoop Kunchukuttan, 2020) | ['kan'] | Classification | s2s | [News, Written] | None | None | | [KinopoiskClassification](https://www.dialog-21.ru/media/1226/blinovpd.pdf) (Blinov et al., 2013) | ['rus'] | 
Classification | p2p | [Reviews, Written] | None | None | | Ko-StrategyQA (Geva et al., 2021) | ['kor'] | Retrieval | s2p | | None | None | -| [KorFin](https://huggingface.co/datasets/amphora/korfin-asc) (Son et al., 2023) | ['kor'] | Classification | s2s | [News, Written, Financial] | None | None | +| [KorFin](https://huggingface.co/datasets/amphora/korfin-asc) (Son et al., 2023) | ['kor'] | Classification | s2s | [Financial, News, Written] | None | None | | [KorHateClassification](https://paperswithcode.com/dataset/korean-hatespeech-dataset) (Jihyung Moon, 2020) | ['kor'] | Classification | s2s | [Social, Written] | None | None | | [KorHateSpeechMLClassification](https://paperswithcode.com/dataset/korean-multi-label-hate-speech-dataset) | ['kor'] | MultilabelClassification | s2s | [Social, Written] | None | None | | [KorSTS](https://arxiv.org/abs/2004.03289) (Ham et al., 2020) | ['kor'] | STS | s2s | [News, Web] | None | None | @@ -307,7 +307,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [LEMBQMSumRetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) | ['eng'] | Retrieval | s2p | [Spoken, Written] | None | None | | [LEMBSummScreenFDRetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) | ['eng'] | Retrieval | s2p | [Spoken, Written] | None | None | | [LEMBWikimQARetrieval](https://huggingface.co/datasets/dwzhu/LongEmbed) (Ho et al., 2020) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | -| [LanguageClassification](https://huggingface.co/datasets/papluca/language-identification) (Conneau et al., 2018) | ['ara', 'bul', 'cmn', 'deu', 'ell', 'eng', 'fra', 'hin', 'ita', 'jpn', 'nld', 'pol', 'por', 'rus', 'spa', 'swa', 'tha', 'tur', 'urd', 'vie'] | Classification | s2s | [Reviews, Web, Non-fiction, Fiction, Government, Written] | {'test': 2048, 'train': 70000} | {'test': {'num_samples': 2048, 'number_of_characters': 224352, 'num_texts_in_train': 31, 'min_text_length': 14, 'average_text_length': 109.55, 'max_text_length': 1270, 'unique_text': 2025, 'unique_labels': 20, 'labels': {'17': {'count': 102}, '0': {'count': 102}, '11': {'count': 102}, '4': {'count': 103}, '3': {'count': 102}, '1': {'count': 102}, '10': {'count': 102}, '2': {'count': 103}, '16': {'count': 103}, '9': {'count': 103}, '5': {'count': 102}, '7': {'count': 102}, '13': {'count': 102}, '14': {'count': 103}, '12': {'count': 102}, '15': {'count': 103}, '19': {'count': 102}, '18': {'count': 102}, '6': {'count': 103}, '8': {'count': 103}}}, 'train': {'num_samples': 70000, 'number_of_characters': 7760299, 'num_texts_in_train': None, 'min_text_length': 2, 'average_text_length': 110.86, 'max_text_length': 2422, 'unique_text': 68978, 'unique_labels': 20, 'labels': {'12': {'count': 3500}, '1': {'count': 3500}, '19': {'count': 3500}, '15': {'count': 3500}, '13': {'count': 3500}, '11': {'count': 3500}, '17': {'count': 3500}, '14': {'count': 3500}, '16': {'count': 3500}, '5': {'count': 3500}, '0': {'count': 3500}, '8': {'count': 3500}, '7': {'count': 3500}, '2': 
{'count': 3500}, '3': {'count': 3500}, '10': {'count': 3500}, '6': {'count': 3500}, '18': {'count': 3500}, '4': {'count': 3500}, '9': {'count': 3500}}}} | +| [LanguageClassification](https://huggingface.co/datasets/papluca/language-identification) (Conneau et al., 2018) | ['ara', 'bul', 'cmn', 'deu', 'ell', 'eng', 'fra', 'hin', 'ita', 'jpn', 'nld', 'pol', 'por', 'rus', 'spa', 'swa', 'tha', 'tur', 'urd', 'vie'] | Classification | s2s | [Fiction, Government, Non-fiction, Reviews, Web, Written] | {'test': 2048, 'train': 70000} | {'test': {'num_samples': 2048, 'number_of_characters': 224352, 'num_texts_in_train': 31, 'min_text_length': 14, 'average_text_length': 109.55, 'max_text_length': 1270, 'unique_text': 2025, 'unique_labels': 20, 'labels': {'17': {'count': 102}, '0': {'count': 102}, '11': {'count': 102}, '4': {'count': 103}, '3': {'count': 102}, '1': {'count': 102}, '10': {'count': 102}, '2': {'count': 103}, '16': {'count': 103}, '9': {'count': 103}, '5': {'count': 102}, '7': {'count': 102}, '13': {'count': 102}, '14': {'count': 103}, '12': {'count': 102}, '15': {'count': 103}, '19': {'count': 102}, '18': {'count': 102}, '6': {'count': 103}, '8': {'count': 103}}}, 'train': {'num_samples': 70000, 'number_of_characters': 7760299, 'num_texts_in_train': None, 'min_text_length': 2, 'average_text_length': 110.86, 'max_text_length': 2422, 'unique_text': 68978, 'unique_labels': 20, 'labels': {'12': {'count': 3500}, '1': {'count': 3500}, '19': {'count': 3500}, '15': {'count': 3500}, '13': {'count': 3500}, '11': {'count': 3500}, '17': {'count': 3500}, '14': {'count': 3500}, '16': {'count': 3500}, '5': {'count': 3500}, '0': {'count': 3500}, '8': {'count': 3500}, '7': {'count': 3500}, '2': {'count': 3500}, '3': {'count': 3500}, '10': {'count': 3500}, '6': {'count': 3500}, '18': {'count': 3500}, '4': {'count': 3500}, '9': {'count': 3500}}}} | | [LccSentimentClassification](https://github.com/fnielsen/lcc-sentiment) | ['dan'] | Classification | s2s | [News, Web, Written] | 
None | None | | [LeCaRDv2](https://github.com/THUIR/LeCaRDv2) (Haitao Li, 2023) | ['zho'] | Retrieval | p2p | [Legal, Written] | None | None | | [LearnedHandsBenefitsLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | @@ -340,17 +340,17 @@ The following tables give you an overview of the tasks in MTEB. | [MIRACLRetrieval](http://miracl.ai/) (Zhang et al., 2023) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [MIRACLRetrievalHardNegatives](http://miracl.ai/) (Zhang et al., 2023) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [MLQARetrieval](https://huggingface.co/datasets/mlqa) | ['ara', 'deu', 'eng', 'hin', 'spa', 'vie', 'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | -| [MLQuestions](https://github.com/McGill-NLP/MLQuestions) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Academic, Written] | None | None | +| [MLQuestions](https://github.com/McGill-NLP/MLQuestions) | ['eng'] | Retrieval | s2p | [Academic, Encyclopaedic, Written] | None | None | | [MLSUMClusteringP2P.v2](https://huggingface.co/datasets/mteb/mlsum) (Scialom et al., 2020) | ['deu', 'fra', 'rus', 'spa'] | Clustering | p2p | [News, Written] | None | None | | [MLSUMClusteringS2S.v2](https://huggingface.co/datasets/mteb/mlsum) (Scialom et al., 2020) | ['deu', 'fra', 'rus', 'spa'] | Clustering | s2s | [News, Written] | None | None | | [MMarcoReranking](https://github.com/unicamp-dl/mMARCO) (Luiz Henrique Bonifacio, 2021) | ['cmn'] | Reranking | s2s | | None | None | | [MMarcoRetrieval](https://arxiv.org/abs/2309.07597) (Shitao Xiao, 2024) | ['cmn'] | Retrieval | s2p | | None | None | -| 
[MSMARCO](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Academic, Blog, News, Medical, Government, Reviews, Non-fiction, Social, Web] | None | None | +| [MSMARCO](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Academic, Blog, Encyclopaedic, Government, Medical, News, Non-fiction, Reviews, Social, Web] | None | None | | [MSMARCO-Fa](https://huggingface.co/datasets/MCINext/msmarco-fa) | ['fas'] | Retrieval | s2p | [Web] | None | None | | [MSMARCO-PL](https://microsoft.github.io/msmarco/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Web, Written] | None | None | | [MSMARCO-PLHardNegatives](https://microsoft.github.io/msmarco/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Web, Written] | None | None | -| [MSMARCOHardNegatives](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Academic, Blog, News, Medical, Government, Reviews, Non-fiction, Social, Web] | None | None | -| [MSMARCOv2](https://microsoft.github.io/msmarco/TREC-Deep-Learning.html) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Academic, Blog, News, Medical, Government, Reviews, Non-fiction, Social, Web] | None | None | +| [MSMARCOHardNegatives](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Academic, Blog, Encyclopaedic, Government, Medical, News, Non-fiction, Reviews, Social, Web] | None | None 
| +| [MSMARCOv2](https://microsoft.github.io/msmarco/TREC-Deep-Learning.html) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Academic, Blog, Encyclopaedic, Government, Medical, News, Non-fiction, Reviews, Social, Web] | None | None | | [MTOPDomainClassification](https://arxiv.org/pdf/2008.09335.pdf) | ['deu', 'eng', 'fra', 'hin', 'spa', 'tha'] | Classification | s2s | [Spoken, Spoken] | None | None | | [MTOPIntentClassification](https://arxiv.org/pdf/2008.09335.pdf) | ['deu', 'eng', 'fra', 'hin', 'spa', 'tha'] | Classification | s2s | [Spoken, Spoken] | None | None | | [MacedonianTweetSentimentClassification](https://aclanthology.org/R15-1034/) | ['mkd'] | Classification | s2s | [Social, Written] | None | None | @@ -358,7 +358,7 @@ The following tables give you an overview of the tasks in MTEB. | [MalteseNewsClassification](https://huggingface.co/datasets/MLRS/maltese_news_categories) | ['mlt'] | MultilabelClassification | s2s | [Constructed, Written] | None | None | | [MarathiNewsClassification](https://github.com/goru001/nlp-for-marathi) (Anoop Kunchukuttan, 2020) | ['mar'] | Classification | s2s | [News, Written] | None | None | | [MasakhaNEWSClassification](https://arxiv.org/abs/2304.09972) (David Ifeoluwa Adelani, 2023) | ['amh', 'eng', 'fra', 'hau', 'ibo', 'lin', 'lug', 'orm', 'pcm', 'run', 'sna', 'som', 'swa', 'tir', 'xho', 'yor'] | Classification | s2s | [News, Written] | None | None | -| [MasakhaNEWSClusteringP2P](https://huggingface.co/datasets/masakhane/masakhanews) (David Ifeoluwa Adelani, 2023) | ['amh', 'eng', 'fra', 'hau', 'ibo', 'lin', 'lug', 'orm', 'pcm', 'run', 'sna', 'som', 'swa', 'tir', 'xho', 'yor'] | Clustering | p2p | [News, Written, Non-fiction] | None | None | +| [MasakhaNEWSClusteringP2P](https://huggingface.co/datasets/masakhane/masakhanews) (David Ifeoluwa Adelani, 2023) | ['amh', 'eng', 'fra', 'hau', 'ibo', 'lin', 'lug', 'orm', 'pcm', 
'run', 'sna', 'som', 'swa', 'tir', 'xho', 'yor'] | Clustering | p2p | [News, Non-fiction, Written] | None | None | | [MasakhaNEWSClusteringS2S](https://huggingface.co/datasets/masakhane/masakhanews) (David Ifeoluwa Adelani, 2023) | ['amh', 'eng', 'fra', 'hau', 'ibo', 'lin', 'lug', 'orm', 'pcm', 'run', 'sna', 'som', 'swa', 'tir', 'xho', 'yor'] | Clustering | s2s | | None | None | | [MassiveIntentClassification](https://arxiv.org/abs/2204.08582) (Jack FitzGerald, 2022) | ['afr', 'amh', 'ara', 'aze', 'ben', 'cmo', 'cym', 'dan', 'deu', 'ell', 'eng', 'fas', 'fin', 'fra', 'heb', 'hin', 'hun', 'hye', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kan', 'kat', 'khm', 'kor', 'lav', 'mal', 'mon', 'msa', 'mya', 'nld', 'nob', 'pol', 'por', 'ron', 'rus', 'slv', 'spa', 'sqi', 'swa', 'swe', 'tam', 'tel', 'tgl', 'tha', 'tur', 'urd', 'vie'] | Classification | s2s | [Spoken] | None | None | | [MassiveScenarioClassification](https://arxiv.org/abs/2204.08582) (Jack FitzGerald, 2022) | ['afr', 'amh', 'ara', 'aze', 'ben', 'cmo', 'cym', 'dan', 'deu', 'ell', 'eng', 'fas', 'fin', 'fra', 'heb', 'hin', 'hun', 'hye', 'ind', 'isl', 'ita', 'jav', 'jpn', 'kan', 'kat', 'khm', 'kor', 'lav', 'mal', 'mon', 'msa', 'mya', 'nld', 'nob', 'pol', 'por', 'ron', 'rus', 'slv', 'spa', 'sqi', 'swa', 'swe', 'tam', 'tel', 'tgl', 'tha', 'tur', 'urd', 'vie'] | Classification | s2s | [Spoken] | None | None | @@ -372,13 +372,13 @@ The following tables give you an overview of the tasks in MTEB. | [Moroco](https://huggingface.co/datasets/moroco) (Andrei M. 
Butnaru, 2019) | ['ron'] | Classification | s2s | [News, Written] | None | None | | [MovieReviewSentimentClassification](https://github.com/TheophileBlard/french-sentiment-analysis-with-bert) (Théophile Blard, 2020) | ['fra'] | Classification | s2s | [Reviews, Written] | None | None | | [MrTidyRetrieval](https://huggingface.co/datasets/castorini/mr-tydi) (Xinyu Zhang, 2021) | ['ara', 'ben', 'eng', 'fin', 'ind', 'jpn', 'kor', 'rus', 'swa', 'tel', 'tha'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | -| [MultiEURLEXMultilabelClassification](https://huggingface.co/datasets/coastalcph/multi_eurlex) (Chalkidis et al., 2021) | ['bul', 'ces', 'dan', 'deu', 'ell', 'eng', 'est', 'fin', 'fra', 'hrv', 'hun', 'ita', 'lav', 'lit', 'mlt', 'nld', 'pol', 'por', 'ron', 'slk', 'slv', 'spa', 'swe'] | MultilabelClassification | p2p | [Legal, Government, Written] | None | None | +| [MultiEURLEXMultilabelClassification](https://huggingface.co/datasets/coastalcph/multi_eurlex) (Chalkidis et al., 2021) | ['bul', 'ces', 'dan', 'deu', 'ell', 'eng', 'est', 'fin', 'fra', 'hrv', 'hun', 'ita', 'lav', 'lit', 'mlt', 'nld', 'pol', 'por', 'ron', 'slk', 'slv', 'spa', 'swe'] | MultilabelClassification | p2p | [Government, Legal, Written] | None | None | | [MultiHateClassification](https://aclanthology.org/2022.woah-1.15/) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'hin', 'ita', 'nld', 'pol', 'por', 'spa'] | Classification | s2s | [Constructed, Written] | None | None | -| [MultiLongDocRetrieval](https://arxiv.org/abs/2402.03216) (Jianlv Chen, 2024) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'hin', 'ita', 'jpn', 'kor', 'por', 'rus', 'spa', 'tha'] | Retrieval | s2p | [Encyclopaedic, Written, Web, Non-fiction, Fiction] | None | None | +| [MultiLongDocRetrieval](https://arxiv.org/abs/2402.03216) (Jianlv Chen, 2024) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'hin', 'ita', 'jpn', 'kor', 'por', 'rus', 'spa', 'tha'] | Retrieval | s2p | [Encyclopaedic, Fiction, Non-fiction, Web, Written] | None | None | | 
[MultilingualSentiment](https://github.com/tyqiangz/multilingual-sentiment-datasets) | ['cmn'] | Classification | s2s | | None | None | | [MultilingualSentimentClassification](https://huggingface.co/datasets/mteb/multilingual-sentiment-classification) | ['ara', 'bam', 'bul', 'cmn', 'cym', 'deu', 'dza', 'ell', 'eng', 'eus', 'fas', 'fin', 'heb', 'hrv', 'ind', 'jpn', 'kor', 'mlt', 'nor', 'pol', 'rus', 'slk', 'spa', 'tha', 'tur', 'uig', 'urd', 'vie', 'zho'] | Classification | s2s | [Reviews, Written] | None | None | | [MyanmarNews](https://huggingface.co/datasets/myanmar_news) (A. H. Khine, 2017) | ['mya'] | Classification | p2p | [News, Written] | None | None | -| [NFCorpus](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Boteva et al., 2016) | ['eng'] | Retrieval | s2p | [Medical, Academic, Written] | {'test': 3956} | {'test': {'number_of_characters': 1612.55, 'num_samples': 3956, 'num_queries': 323, 'num_documents': 3633, 'average_document_length': 0.44, 'average_query_length': 0.07, 'average_relevant_docs_per_query': 38.19}} | +| [NFCorpus](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Boteva et al., 2016) | ['eng'] | Retrieval | s2p | [Academic, Medical, Written] | {'test': 3956} | {'test': {'number_of_characters': 1612.55, 'num_samples': 3956, 'num_queries': 323, 'num_documents': 3633, 'average_document_length': 0.44, 'average_query_length': 0.07, 'average_relevant_docs_per_query': 38.19}} | | [NFCorpus-Fa](https://huggingface.co/datasets/MCINext/nfcorpus-fa) | ['fas'] | Retrieval | s2p | [Medical] | None | None | | [NFCorpus-PL](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | | [NLPJournalAbsIntroRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | @@ -386,7 +386,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [NLPJournalTitleIntroRetrieval](https://github.com/sbintuitions/JMTEB) | ['jpn'] | Retrieval | s2s | [Academic, Written] | None | None | | [NLPTwitterAnalysisClassification](https://huggingface.co/datasets/hamedhf/nlp_twitter_analysis/tree/main) | ['fas'] | Classification | s2p | [Social] | None | None | | [NLPTwitterAnalysisClustering](https://huggingface.co/datasets/hamedhf/nlp_twitter_analysis/commits/main) | ['fas'] | Clustering | s2s | [Social] | None | None | -| [NQ](https://ai.google.com/research/NaturalQuestions/) (Tom Kwiatkowski, 2019) | ['eng'] | Retrieval | s2p | [Written, Encyclopaedic] | None | None | +| [NQ](https://ai.google.com/research/NaturalQuestions/) (Tom Kwiatkowski, 2019) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [NQ-Fa](https://huggingface.co/datasets/MCINext/nq-fa) | ['fas'] | Retrieval | s2p | [Encyclopaedic] | None | None | | [NQ-PL](https://ai.google.com/research/NaturalQuestions/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | | [NQ-PLHardNegatives](https://ai.google.com/research/NaturalQuestions/) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | @@ -396,16 +396,16 @@ The following tables give you an overview of the tasks in MTEB. 
| [NaijaSenti](https://github.com/hausanlp/NaijaSenti) | ['hau', 'ibo', 'pcm', 'yor'] | Classification | s2s | [Social, Written] | None | None | | [NamaaMrTydiReranking](https://huggingface.co/NAMAA-Space) (Muennighoff et al., 2022) | ['ara'] | Reranking | s2s | [Encyclopaedic, Written] | None | None | | [NanoArguAnaRetrieval](http://argumentation.bplaced.net/arguana/data) (Boteva et al., 2016) | ['eng'] | Retrieval | s2p | [Medical, Written] | None | None | -| [NanoClimateFeverRetrieval](https://arxiv.org/abs/2012.00614) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | [Non-fiction, Academic, News] | None | None | +| [NanoClimateFeverRetrieval](https://arxiv.org/abs/2012.00614) (Thomas Diggelmann, 2021) | ['eng'] | Retrieval | s2p | [Academic, News, Non-fiction] | None | None | | [NanoDBPediaRetrieval](https://huggingface.co/datasets/zeta-alpha-ai/NanoDBPedia) (Lehmann et al., 2015) | ['eng'] | Retrieval | s2p | [Encyclopaedic] | None | None | | [NanoFEVERRetrieval](https://fever.ai/) | ['eng'] | Retrieval | s2p | [Academic, Encyclopaedic] | None | None | | [NanoFiQA2018Retrieval](https://sites.google.com/view/fiqa/) (Nandan Thakur, 2021) | ['eng'] | Retrieval | s2p | [Academic, Social] | None | None | | [NanoHotpotQARetrieval](https://hotpotqa.github.io/) | ['eng'] | Retrieval | s2p | [Web, Written] | None | None | | [NanoMSMARCORetrieval](https://microsoft.github.io/msmarco/) (Tri Nguyen and Mir Rosenberg and Xia Song and Jianfeng Gao and Saurabh Tiwary and Rangan Majumder and Li Deng, 2016) | ['eng'] | Retrieval | s2p | [Web] | None | None | -| [NanoNFCorpusRetrieval](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Boteva et al., 2016) | ['eng'] | Retrieval | s2p | [Medical, Academic, Written] | None | None | +| [NanoNFCorpusRetrieval](https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/) (Boteva et al., 2016) | ['eng'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | | 
[NanoNQRetrieval](https://ai.google.com/research/NaturalQuestions) (Tom Kwiatkowski, 2019) | ['eng'] | Retrieval | s2p | [Academic, Web] | None | None | | [NanoQuoraRetrieval](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | [Social] | None | None | -| [NanoSCIDOCSRetrieval](https://allenai.org/data/scidocs) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Written, Non-fiction] | None | None | +| [NanoSCIDOCSRetrieval](https://allenai.org/data/scidocs) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | None | | [NanoSciFactRetrieval](https://github.com/allenai/scifact) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | | [NanoTouche2020Retrieval](https://webis.de/events/touche-20/shared-task-1.html) | ['eng'] | Retrieval | s2p | [Academic] | None | None | | [NarrativeQARetrieval](https://metatext.io/datasets/narrativeqa) (Tomáš Kočiský, 2017) | ['eng'] | Retrieval | s2p | | None | None | @@ -416,16 +416,16 @@ The following tables give you an overview of the tasks in MTEB. 
| [NeuCLIR2023RetrievalHardNegatives](https://neuclir.github.io/) (Dawn Lawrie, 2024) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | None | None | | [News21InstructionRetrieval](https://arxiv.org/abs/2403.15246) (Orion Weller, 2024) | ['eng'] | InstructionRetrieval | s2p | [News, Written] | None | None | | [NewsClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [News, Written] | None | None | -| [NoRecClassification](https://aclanthology.org/L18-1661/) | ['nob'] | Classification | s2s | [Written, Reviews] | None | None | -| [NollySentiBitextMining](https://github.com/IyanuSh/NollySenti) (Shode et al., 2023) | ['eng', 'hau', 'ibo', 'pcm', 'yor'] | BitextMining | s2s | [Social, Reviews, Written] | {'train': 1640} | {'train': {'num_samples': 1640, 'number_of_characters': 445805, 'unique_pairs': 1632, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 3, 'average_sentence2_length': 135.52, 'max_sentence2_length': 1728, 'unique_sentence2': 1631, 'hf_subset_descriptive_stats': {'en-ha': {'num_samples': 410, 'number_of_characters': 115348, 'unique_pairs': 407, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 4, 'average_sentence2_length': 145.02, 'max_sentence2_length': 1728, 'unique_sentence2': 407}, 'en-ig': {'num_samples': 410, 'number_of_characters': 107173, 'unique_pairs': 409, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 5, 'average_sentence2_length': 125.08, 'max_sentence2_length': 1137, 'unique_sentence2': 408}, 'en-pcm': {'num_samples': 410, 'number_of_characters': 109955, 'unique_pairs': 408, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 
'min_sentence2_length': 3, 'average_sentence2_length': 131.87, 'max_sentence2_length': 1552, 'unique_sentence2': 408}, 'en-yo': {'num_samples': 410, 'number_of_characters': 113329, 'unique_pairs': 409, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 6, 'average_sentence2_length': 140.1, 'max_sentence2_length': 1338, 'unique_sentence2': 409}}}} | +| [NoRecClassification](https://aclanthology.org/L18-1661/) | ['nob'] | Classification | s2s | [Reviews, Written] | None | None | +| [NollySentiBitextMining](https://github.com/IyanuSh/NollySenti) (Shode et al., 2023) | ['eng', 'hau', 'ibo', 'pcm', 'yor'] | BitextMining | s2s | [Reviews, Social, Written] | {'train': 1640} | {'train': {'num_samples': 1640, 'number_of_characters': 445805, 'unique_pairs': 1632, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 3, 'average_sentence2_length': 135.52, 'max_sentence2_length': 1728, 'unique_sentence2': 1631, 'hf_subset_descriptive_stats': {'en-ha': {'num_samples': 410, 'number_of_characters': 115348, 'unique_pairs': 407, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 4, 'average_sentence2_length': 145.02, 'max_sentence2_length': 1728, 'unique_sentence2': 407}, 'en-ig': {'num_samples': 410, 'number_of_characters': 107173, 'unique_pairs': 409, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 5, 'average_sentence2_length': 125.08, 'max_sentence2_length': 1137, 'unique_sentence2': 408}, 'en-pcm': {'num_samples': 410, 'number_of_characters': 109955, 'unique_pairs': 408, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 3, 
'average_sentence2_length': 131.87, 'max_sentence2_length': 1552, 'unique_sentence2': 408}, 'en-yo': {'num_samples': 410, 'number_of_characters': 113329, 'unique_pairs': 409, 'min_sentence1_length': 3, 'average_sentence1_length': 136.32, 'max_sentence1_length': 1698, 'unique_sentence1': 405, 'min_sentence2_length': 6, 'average_sentence2_length': 140.1, 'max_sentence2_length': 1338, 'unique_sentence2': 409}}}} | | [NorQuadRetrieval](https://aclanthology.org/2023.nodalida-1.17/) | ['nob'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Written] | None | None | | [NordicLangClassification](https://aclanthology.org/2021.vardial-1.8/) | ['dan', 'fao', 'isl', 'nno', 'nob', 'swe'] | Classification | s2s | [Encyclopaedic] | None | None | | [NorwegianCourtsBitextMining](https://opus.nlpl.eu/index.php) (Tiedemann et al., 2020) | ['nno', 'nob'] | BitextMining | s2s | [Legal, Written] | {'test': 228} | {'test': {'num_samples': 228, 'number_of_characters': 37441, 'unique_pairs': 228, 'min_sentence1_length': 13, 'average_sentence1_length': 82.2, 'max_sentence1_length': 272, 'unique_sentence1': 227, 'min_sentence2_length': 10, 'average_sentence2_length': 82.02, 'max_sentence2_length': 269, 'unique_sentence2': 226}} | | [NorwegianParliamentClassification](https://huggingface.co/datasets/NbAiLab/norwegian_parliament) | ['nob'] | Classification | s2s | [Government, Spoken] | None | None | -| [NusaParagraphEmotionClassification](https://github.com/IndoNLP/nusa-writes) | ['bbc', 'bew', 'bug', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | Classification | s2s | [Non-fiction, Fiction, Written] | None | None | -| [NusaParagraphTopicClassification](https://github.com/IndoNLP/nusa-writes) | ['bbc', 'bew', 'bug', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | Classification | s2s | [Non-fiction, Fiction, Written] | None | None | +| [NusaParagraphEmotionClassification](https://github.com/IndoNLP/nusa-writes) | ['bbc', 'bew', 'bug', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | 
Classification | s2s | [Fiction, Non-fiction, Written] | None | None | +| [NusaParagraphTopicClassification](https://github.com/IndoNLP/nusa-writes) | ['bbc', 'bew', 'bug', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | Classification | s2s | [Fiction, Non-fiction, Written] | None | None | | [NusaTranslationBitextMining](https://huggingface.co/datasets/indonlp/nusatranslation_mt) (Cahyawijaya et al., 2023) | ['abs', 'bbc', 'bew', 'bhp', 'ind', 'jav', 'mad', 'mak', 'min', 'mui', 'rej', 'sun'] | BitextMining | s2s | [Social, Written] | {'train': 50200} | {'train': {'num_samples': 50200, 'number_of_characters': 14759870, 'unique_pairs': 50140, 'min_sentence1_length': 5, 'average_sentence1_length': 145.46, 'max_sentence1_length': 873, 'unique_sentence1': 8258, 'min_sentence2_length': 5, 'average_sentence2_length': 148.57, 'max_sentence2_length': 980, 'unique_sentence2': 50102, 'hf_subset_descriptive_stats': {'ind-abs': {'num_samples': 1000, 'number_of_characters': 295680, 'unique_pairs': 999, 'min_sentence1_length': 5, 'average_sentence1_length': 148.37, 'max_sentence1_length': 727, 'unique_sentence1': 998, 'min_sentence2_length': 6, 'average_sentence2_length': 147.31, 'max_sentence2_length': 629, 'unique_sentence2': 998}, 'ind-btk': {'num_samples': 6600, 'number_of_characters': 1927907, 'unique_pairs': 6597, 'min_sentence1_length': 5, 'average_sentence1_length': 145.37, 'max_sentence1_length': 873, 'unique_sentence1': 6521, 'min_sentence2_length': 5, 'average_sentence2_length': 146.74, 'max_sentence2_length': 980, 'unique_sentence2': 6596}, 'ind-bew': {'num_samples': 6600, 'number_of_characters': 1939300, 'unique_pairs': 6595, 'min_sentence1_length': 5, 'average_sentence1_length': 145.43, 'max_sentence1_length': 873, 'unique_sentence1': 6512, 'min_sentence2_length': 6, 'average_sentence2_length': 148.41, 'max_sentence2_length': 840, 'unique_sentence2': 6590}, 'ind-bhp': {'num_samples': 1000, 'number_of_characters': 261666, 'unique_pairs': 1000, 
'min_sentence1_length': 11, 'average_sentence1_length': 133.53, 'max_sentence1_length': 468, 'unique_sentence1': 999, 'min_sentence2_length': 10, 'average_sentence2_length': 128.14, 'max_sentence2_length': 459, 'unique_sentence2': 999}, 'ind-jav': {'num_samples': 6600, 'number_of_characters': 1922162, 'unique_pairs': 6594, 'min_sentence1_length': 5, 'average_sentence1_length': 145.43, 'max_sentence1_length': 873, 'unique_sentence1': 6512, 'min_sentence2_length': 5, 'average_sentence2_length': 145.81, 'max_sentence2_length': 854, 'unique_sentence2': 6585}, 'ind-mad': {'num_samples': 6600, 'number_of_characters': 1973257, 'unique_pairs': 6598, 'min_sentence1_length': 5, 'average_sentence1_length': 145.36, 'max_sentence1_length': 873, 'unique_sentence1': 6521, 'min_sentence2_length': 5, 'average_sentence2_length': 153.62, 'max_sentence2_length': 827, 'unique_sentence2': 6592}, 'ind-mak': {'num_samples': 6600, 'number_of_characters': 1953868, 'unique_pairs': 6594, 'min_sentence1_length': 5, 'average_sentence1_length': 145.43, 'max_sentence1_length': 873, 'unique_sentence1': 6512, 'min_sentence2_length': 6, 'average_sentence2_length': 150.61, 'max_sentence2_length': 888, 'unique_sentence2': 6586}, 'ind-min': {'num_samples': 6600, 'number_of_characters': 1937033, 'unique_pairs': 6595, 'min_sentence1_length': 5, 'average_sentence1_length': 145.43, 'max_sentence1_length': 873, 'unique_sentence1': 6512, 'min_sentence2_length': 6, 'average_sentence2_length': 148.06, 'max_sentence2_length': 837, 'unique_sentence2': 6591}, 'ind-mui': {'num_samples': 1000, 'number_of_characters': 301448, 'unique_pairs': 1000, 'min_sentence1_length': 11, 'average_sentence1_length': 150.45, 'max_sentence1_length': 451, 'unique_sentence1': 997, 'min_sentence2_length': 11, 'average_sentence2_length': 150.99, 'max_sentence2_length': 450, 'unique_sentence2': 1000}, 'ind-rej': {'num_samples': 1000, 'number_of_characters': 291205, 'unique_pairs': 1000, 'min_sentence1_length': 9, 
'average_sentence1_length': 151.62, 'max_sentence1_length': 873, 'unique_sentence1': 998, 'min_sentence2_length': 8, 'average_sentence2_length': 139.58, 'max_sentence2_length': 784, 'unique_sentence2': 1000}, 'ind-sun': {'num_samples': 6600, 'number_of_characters': 1956344, 'unique_pairs': 6591, 'min_sentence1_length': 5, 'average_sentence1_length': 145.43, 'max_sentence1_length': 873, 'unique_sentence1': 6512, 'min_sentence2_length': 5, 'average_sentence2_length': 150.99, 'max_sentence2_length': 881, 'unique_sentence2': 6588}}}} | -| [NusaX-senti](https://arxiv.org/abs/2205.15960) (Winata et al., 2022) | ['ace', 'ban', 'bbc', 'bjn', 'bug', 'eng', 'ind', 'jav', 'mad', 'min', 'nij', 'sun'] | Classification | s2s | [Reviews, Web, Social, Constructed, Written] | None | None | +| [NusaX-senti](https://arxiv.org/abs/2205.15960) (Winata et al., 2022) | ['ace', 'ban', 'bbc', 'bjn', 'bug', 'eng', 'ind', 'jav', 'mad', 'min', 'nij', 'sun'] | Classification | s2s | [Constructed, Reviews, Social, Web, Written] | None | None | | [NusaXBitextMining](https://huggingface.co/datasets/indonlp/NusaX-senti/) (Winata et al., 2023) | ['ace', 'ban', 'bbc', 'bjn', 'bug', 'eng', 'ind', 'jav', 'mad', 'min', 'nij', 'sun'] | BitextMining | s2s | [Reviews, Written] | None | None | | [OPP115DataRetentionLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [OPP115DataSecurityLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | @@ -451,7 +451,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [ParsinluEntail](https://github.com/persiannlp/parsinlu) | ['fas'] | PairClassification | s2s | | None | None | | [ParsinluQueryParaphPC](https://huggingface.co/datasets/persiannlp/parsinlu_query_paraphrasing) | ['fas'] | PairClassification | s2s | | None | None | | [PatentClassification](https://aclanthology.org/P19-1212.pdf) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [PawsXPairClassification](https://arxiv.org/abs/1908.11828) (Yinfei Yang, 2019) | ['cmn', 'deu', 'eng', 'fra', 'jpn', 'kor', 'spa'] | PairClassification | s2s | [Web, Encyclopaedic, Written] | {'test': 14000, 'validation': 14000} | {'test': {'num_samples': 14000, 'number_of_characters': 2551922, 'min_sentence1_length': 2, 'avg_sentence1_length': 91.18, 'max_sentence1_length': 268, 'unique_sentence1': 13404, 'min_sentence2_length': 2, 'avg_sentence2_length': 91.1, 'max_sentence2_length': 247, 'unique_sentence2': 13462, 'unique_labels': 2, 'labels': {'1': {'count': 6285}, '0': {'count': 7715}}, 'hf_subset_descriptive_stats': {'de': {'num_samples': 2000, 'number_of_characters': 478034, 'min_sentence1_length': 2, 'avg_sentence1_length': 119.78, 'max_sentence1_length': 268, 'unique_sentence1': 1934, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.24, 'max_sentence2_length': 235, 'unique_sentence2': 1938, 'unique_labels': 2, 'labels': {'1': {'count': 895}, '0': {'count': 1105}}}, 'en': {'num_samples': 2000, 'number_of_characters': 454362, 'min_sentence1_length': 25, 'avg_sentence1_length': 113.76, 'max_sentence1_length': 209, 'unique_sentence1': 1761, 'min_sentence2_length': 25, 'avg_sentence2_length': 113.42, 'max_sentence2_length': 209, 'unique_sentence2': 1800, 'unique_labels': 2, 'labels': {'1': {'count': 907}, '0': {'count': 1093}}}, 'es': {'num_samples': 2000, 'number_of_characters': 471226, 'min_sentence1_length': 2, 'avg_sentence1_length': 117.81, 'max_sentence1_length': 226, 'unique_sentence1': 1955, 'min_sentence2_length': 22, 'avg_sentence2_length': 117.8, 
'max_sentence2_length': 233, 'unique_sentence2': 1959, 'unique_labels': 2, 'labels': {'1': {'count': 907}, '0': {'count': 1093}}}, 'fr': {'num_samples': 2000, 'number_of_characters': 480033, 'min_sentence1_length': 2, 'avg_sentence1_length': 120.03, 'max_sentence1_length': 238, 'unique_sentence1': 1954, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.99, 'max_sentence2_length': 247, 'unique_sentence2': 1953, 'unique_labels': 2, 'labels': {'1': {'count': 903}, '0': {'count': 1097}}}, 'ja': {'num_samples': 2000, 'number_of_characters': 235106, 'min_sentence1_length': 2, 'avg_sentence1_length': 58.68, 'max_sentence1_length': 192, 'unique_sentence1': 1944, 'min_sentence2_length': 2, 'avg_sentence2_length': 58.88, 'max_sentence2_length': 198, 'unique_sentence2': 1941, 'unique_labels': 2, 'labels': {'1': {'count': 883}, '0': {'count': 1117}}}, 'ko': {'num_samples': 2000, 'number_of_characters': 260149, 'min_sentence1_length': 2, 'avg_sentence1_length': 64.96, 'max_sentence1_length': 153, 'unique_sentence1': 1954, 'min_sentence2_length': 2, 'avg_sentence2_length': 65.11, 'max_sentence2_length': 159, 'unique_sentence2': 1969, 'unique_labels': 2, 'labels': {'1': {'count': 896}, '0': {'count': 1104}}}, 'zh': {'num_samples': 2000, 'number_of_characters': 173012, 'min_sentence1_length': 2, 'avg_sentence1_length': 43.23, 'max_sentence1_length': 120, 'unique_sentence1': 1909, 'min_sentence2_length': 2, 'avg_sentence2_length': 43.27, 'max_sentence2_length': 113, 'unique_sentence2': 1909, 'unique_labels': 2, 'labels': {'1': {'count': 894}, '0': {'count': 1106}}}}}, 'validation': {'num_samples': 14000, 'number_of_characters': 2524625, 'min_sentence1_length': 2, 'avg_sentence1_length': 90.13, 'max_sentence1_length': 248, 'unique_sentence1': 13357, 'min_sentence2_length': 2, 'avg_sentence2_length': 90.2, 'max_sentence2_length': 275, 'unique_sentence2': 13397, 'unique_labels': 2, 'labels': {'1': {'count': 5948}, '0': {'count': 8052}}, 'hf_subset_descriptive_stats': {'de': 
{'num_samples': 2000, 'number_of_characters': 467643, 'min_sentence1_length': 2, 'avg_sentence1_length': 116.82, 'max_sentence1_length': 248, 'unique_sentence1': 1914, 'min_sentence2_length': 2, 'avg_sentence2_length': 117.0, 'max_sentence2_length': 275, 'unique_sentence2': 1920, 'unique_labels': 2, 'labels': {'1': {'count': 831}, '0': {'count': 1169}}}, 'en': {'num_samples': 2000, 'number_of_characters': 451931, 'min_sentence1_length': 25, 'avg_sentence1_length': 113.11, 'max_sentence1_length': 213, 'unique_sentence1': 1758, 'min_sentence2_length': 25, 'avg_sentence2_length': 112.86, 'max_sentence2_length': 213, 'unique_sentence2': 1771, 'unique_labels': 2, 'labels': {'1': {'count': 863}, '0': {'count': 1137}}}, 'es': {'num_samples': 2000, 'number_of_characters': 466112, 'min_sentence1_length': 2, 'avg_sentence1_length': 116.33, 'max_sentence1_length': 240, 'unique_sentence1': 1938, 'min_sentence2_length': 2, 'avg_sentence2_length': 116.73, 'max_sentence2_length': 241, 'unique_sentence2': 1941, 'unique_labels': 2, 'labels': {'1': {'count': 847}, '0': {'count': 1153}}}, 'fr': {'num_samples': 2000, 'number_of_characters': 478510, 'min_sentence1_length': 2, 'avg_sentence1_length': 119.5, 'max_sentence1_length': 233, 'unique_sentence1': 1933, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.75, 'max_sentence2_length': 246, 'unique_sentence2': 1939, 'unique_labels': 2, 'labels': {'1': {'count': 860}, '0': {'count': 1140}}}, 'ja': {'num_samples': 2000, 'number_of_characters': 229655, 'min_sentence1_length': 2, 'avg_sentence1_length': 57.51, 'max_sentence1_length': 126, 'unique_sentence1': 1957, 'min_sentence2_length': 2, 'avg_sentence2_length': 57.32, 'max_sentence2_length': 121, 'unique_sentence2': 1969, 'unique_labels': 2, 'labels': {'1': {'count': 854}, '0': {'count': 1146}}}, 'ko': {'num_samples': 2000, 'number_of_characters': 261355, 'min_sentence1_length': 2, 'avg_sentence1_length': 65.16, 'max_sentence1_length': 178, 'unique_sentence1': 1963, 
'min_sentence2_length': 2, 'avg_sentence2_length': 65.52, 'max_sentence2_length': 174, 'unique_sentence2': 1968, 'unique_labels': 2, 'labels': {'1': {'count': 840}, '0': {'count': 1160}}}, 'zh': {'num_samples': 2000, 'number_of_characters': 169419, 'min_sentence1_length': 2, 'avg_sentence1_length': 42.45, 'max_sentence1_length': 101, 'unique_sentence1': 1899, 'min_sentence2_length': 2, 'avg_sentence2_length': 42.26, 'max_sentence2_length': 120, 'unique_sentence2': 1895, 'unique_labels': 2, 'labels': {'1': {'count': 853}, '0': {'count': 1147}}}}}} | +| [PawsXPairClassification](https://arxiv.org/abs/1908.11828) (Yinfei Yang, 2019) | ['cmn', 'deu', 'eng', 'fra', 'jpn', 'kor', 'spa'] | PairClassification | s2s | [Encyclopaedic, Web, Written] | {'test': 14000, 'validation': 14000} | {'test': {'num_samples': 14000, 'number_of_characters': 2551922, 'min_sentence1_length': 2, 'avg_sentence1_length': 91.18, 'max_sentence1_length': 268, 'unique_sentence1': 13404, 'min_sentence2_length': 2, 'avg_sentence2_length': 91.1, 'max_sentence2_length': 247, 'unique_sentence2': 13462, 'unique_labels': 2, 'labels': {'1': {'count': 6285}, '0': {'count': 7715}}, 'hf_subset_descriptive_stats': {'de': {'num_samples': 2000, 'number_of_characters': 478034, 'min_sentence1_length': 2, 'avg_sentence1_length': 119.78, 'max_sentence1_length': 268, 'unique_sentence1': 1934, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.24, 'max_sentence2_length': 235, 'unique_sentence2': 1938, 'unique_labels': 2, 'labels': {'1': {'count': 895}, '0': {'count': 1105}}}, 'en': {'num_samples': 2000, 'number_of_characters': 454362, 'min_sentence1_length': 25, 'avg_sentence1_length': 113.76, 'max_sentence1_length': 209, 'unique_sentence1': 1761, 'min_sentence2_length': 25, 'avg_sentence2_length': 113.42, 'max_sentence2_length': 209, 'unique_sentence2': 1800, 'unique_labels': 2, 'labels': {'1': {'count': 907}, '0': {'count': 1093}}}, 'es': {'num_samples': 2000, 'number_of_characters': 471226, 
'min_sentence1_length': 2, 'avg_sentence1_length': 117.81, 'max_sentence1_length': 226, 'unique_sentence1': 1955, 'min_sentence2_length': 22, 'avg_sentence2_length': 117.8, 'max_sentence2_length': 233, 'unique_sentence2': 1959, 'unique_labels': 2, 'labels': {'1': {'count': 907}, '0': {'count': 1093}}}, 'fr': {'num_samples': 2000, 'number_of_characters': 480033, 'min_sentence1_length': 2, 'avg_sentence1_length': 120.03, 'max_sentence1_length': 238, 'unique_sentence1': 1954, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.99, 'max_sentence2_length': 247, 'unique_sentence2': 1953, 'unique_labels': 2, 'labels': {'1': {'count': 903}, '0': {'count': 1097}}}, 'ja': {'num_samples': 2000, 'number_of_characters': 235106, 'min_sentence1_length': 2, 'avg_sentence1_length': 58.68, 'max_sentence1_length': 192, 'unique_sentence1': 1944, 'min_sentence2_length': 2, 'avg_sentence2_length': 58.88, 'max_sentence2_length': 198, 'unique_sentence2': 1941, 'unique_labels': 2, 'labels': {'1': {'count': 883}, '0': {'count': 1117}}}, 'ko': {'num_samples': 2000, 'number_of_characters': 260149, 'min_sentence1_length': 2, 'avg_sentence1_length': 64.96, 'max_sentence1_length': 153, 'unique_sentence1': 1954, 'min_sentence2_length': 2, 'avg_sentence2_length': 65.11, 'max_sentence2_length': 159, 'unique_sentence2': 1969, 'unique_labels': 2, 'labels': {'1': {'count': 896}, '0': {'count': 1104}}}, 'zh': {'num_samples': 2000, 'number_of_characters': 173012, 'min_sentence1_length': 2, 'avg_sentence1_length': 43.23, 'max_sentence1_length': 120, 'unique_sentence1': 1909, 'min_sentence2_length': 2, 'avg_sentence2_length': 43.27, 'max_sentence2_length': 113, 'unique_sentence2': 1909, 'unique_labels': 2, 'labels': {'1': {'count': 894}, '0': {'count': 1106}}}}}, 'validation': {'num_samples': 14000, 'number_of_characters': 2524625, 'min_sentence1_length': 2, 'avg_sentence1_length': 90.13, 'max_sentence1_length': 248, 'unique_sentence1': 13357, 'min_sentence2_length': 2, 'avg_sentence2_length': 90.2, 
'max_sentence2_length': 275, 'unique_sentence2': 13397, 'unique_labels': 2, 'labels': {'1': {'count': 5948}, '0': {'count': 8052}}, 'hf_subset_descriptive_stats': {'de': {'num_samples': 2000, 'number_of_characters': 467643, 'min_sentence1_length': 2, 'avg_sentence1_length': 116.82, 'max_sentence1_length': 248, 'unique_sentence1': 1914, 'min_sentence2_length': 2, 'avg_sentence2_length': 117.0, 'max_sentence2_length': 275, 'unique_sentence2': 1920, 'unique_labels': 2, 'labels': {'1': {'count': 831}, '0': {'count': 1169}}}, 'en': {'num_samples': 2000, 'number_of_characters': 451931, 'min_sentence1_length': 25, 'avg_sentence1_length': 113.11, 'max_sentence1_length': 213, 'unique_sentence1': 1758, 'min_sentence2_length': 25, 'avg_sentence2_length': 112.86, 'max_sentence2_length': 213, 'unique_sentence2': 1771, 'unique_labels': 2, 'labels': {'1': {'count': 863}, '0': {'count': 1137}}}, 'es': {'num_samples': 2000, 'number_of_characters': 466112, 'min_sentence1_length': 2, 'avg_sentence1_length': 116.33, 'max_sentence1_length': 240, 'unique_sentence1': 1938, 'min_sentence2_length': 2, 'avg_sentence2_length': 116.73, 'max_sentence2_length': 241, 'unique_sentence2': 1941, 'unique_labels': 2, 'labels': {'1': {'count': 847}, '0': {'count': 1153}}}, 'fr': {'num_samples': 2000, 'number_of_characters': 478510, 'min_sentence1_length': 2, 'avg_sentence1_length': 119.5, 'max_sentence1_length': 233, 'unique_sentence1': 1933, 'min_sentence2_length': 2, 'avg_sentence2_length': 119.75, 'max_sentence2_length': 246, 'unique_sentence2': 1939, 'unique_labels': 2, 'labels': {'1': {'count': 860}, '0': {'count': 1140}}}, 'ja': {'num_samples': 2000, 'number_of_characters': 229655, 'min_sentence1_length': 2, 'avg_sentence1_length': 57.51, 'max_sentence1_length': 126, 'unique_sentence1': 1957, 'min_sentence2_length': 2, 'avg_sentence2_length': 57.32, 'max_sentence2_length': 121, 'unique_sentence2': 1969, 'unique_labels': 2, 'labels': {'1': {'count': 854}, '0': {'count': 1146}}}, 'ko': 
{'num_samples': 2000, 'number_of_characters': 261355, 'min_sentence1_length': 2, 'avg_sentence1_length': 65.16, 'max_sentence1_length': 178, 'unique_sentence1': 1963, 'min_sentence2_length': 2, 'avg_sentence2_length': 65.52, 'max_sentence2_length': 174, 'unique_sentence2': 1968, 'unique_labels': 2, 'labels': {'1': {'count': 840}, '0': {'count': 1160}}}, 'zh': {'num_samples': 2000, 'number_of_characters': 169419, 'min_sentence1_length': 2, 'avg_sentence1_length': 42.45, 'max_sentence1_length': 101, 'unique_sentence1': 1899, 'min_sentence2_length': 2, 'avg_sentence2_length': 42.26, 'max_sentence2_length': 120, 'unique_sentence2': 1895, 'unique_labels': 2, 'labels': {'1': {'count': 853}, '0': {'count': 1147}}}}}} | | [PersianFoodSentimentClassification](https://hooshvare.github.io/docs/datasets/sa) (Mehrdad Farahani et al., 2020) | ['fas'] | Classification | s2s | [Reviews, Written] | None | None | | [PersianTextEmotion](https://huggingface.co/datasets/SeyedAli/Persian-Text-Emotion) | ['fas'] | Classification | s2s | | None | None | | [PersianTextTone](https://mcinext.com/) | ['fas'] | Classification | s2p | | None | None | @@ -461,31 +461,31 @@ The following tables give you an overview of the tasks in MTEB. 
| [PlscClusteringP2P.v2](https://huggingface.co/datasets/rafalposwiata/plsc) | ['pol'] | Clustering | s2s | [Academic, Written] | None | None | | [PlscClusteringS2S.v2](https://huggingface.co/datasets/rafalposwiata/plsc) | ['pol'] | Clustering | s2s | [Academic, Written] | None | None | | [PoemSentimentClassification](https://arxiv.org/abs/2011.02686) (Emily Sheng, 2020) | ['eng'] | Classification | s2s | [Reviews, Written] | None | None | -| [PolEmo2.0-IN](https://aclanthology.org/K19-1092.pdf) | ['pol'] | Classification | s2s | [Written, Social] | None | None | -| [PolEmo2.0-OUT](https://aclanthology.org/K19-1092.pdf) | ['pol'] | Classification | s2s | [Written, Social] | None | None | -| [PpcPC](https://arxiv.org/pdf/2207.12759.pdf) (Sławomir Dadas, 2022) | ['pol'] | PairClassification | s2s | [Fiction, Non-fiction, Web, Written, Spoken, Social, News] | None | None | +| [PolEmo2.0-IN](https://aclanthology.org/K19-1092.pdf) | ['pol'] | Classification | s2s | [Social, Written] | None | None | +| [PolEmo2.0-OUT](https://aclanthology.org/K19-1092.pdf) | ['pol'] | Classification | s2s | [Social, Written] | None | None | +| [PpcPC](https://arxiv.org/pdf/2207.12759.pdf) (Sławomir Dadas, 2022) | ['pol'] | PairClassification | s2s | [Fiction, News, Non-fiction, Social, Spoken, Web, Written] | None | None | | [PubChemAISentenceParaphrasePC](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | PairClassification | s2s | [Chemistry] | None | None | | [PubChemSMILESBitextMining](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | BitextMining | s2s | [Chemistry] | None | None | | [PubChemSMILESPC](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | PairClassification | s2s | [Chemistry] | None | None | | [PubChemSynonymPC](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | PairClassification | s2s | [Chemistry] | None | None | | [PubChemWikiPairClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et 
al., 2024) | ['ces', 'deu', 'eng', 'fra', 'hin', 'jpn', 'kor', 'msa', 'nld', 'por', 'spa', 'tur', 'zho'] | PairClassification | s2s | [Chemistry] | None | None | | [PubChemWikiParagraphsPC](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | PairClassification | p2p | [Chemistry] | None | None | -| [PublicHealthQA](https://huggingface.co/datasets/xhluca/publichealth-qa) | ['ara', 'eng', 'fra', 'kor', 'rus', 'spa', 'vie', 'zho'] | Retrieval | s2p | [Medical, Government, Web, Written] | None | None | +| [PublicHealthQA](https://huggingface.co/datasets/xhluca/publichealth-qa) | ['ara', 'eng', 'fra', 'kor', 'rus', 'spa', 'vie', 'zho'] | Retrieval | s2p | [Government, Medical, Web, Written] | None | None | | [PunjabiNewsClassification](https://github.com/goru001/nlp-for-punjabi/) (Anoop Kunchukuttan, 2020) | ['pan'] | Classification | s2s | [News, Written] | None | None | | [QBQTC](https://github.com/CLUEbenchmark/QBQTC/tree/main/dataset) | ['cmn'] | STS | s2s | | None | None | | [Quail](https://text-machine.cs.uml.edu/lab2/projects/quail/) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | | [Query2Query](https://mcinext.com/) | ['fas'] | STS | s2s | | None | None | | [Quora-PL](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2s | | None | None | | [Quora-PLHardNegatives](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2s | | None | None | -| [QuoraRetrieval](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | [Written, Web, Blog] | None | None | +| [QuoraRetrieval](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | [Blog, Web, Written] | None | None | | 
[QuoraRetrieval-Fa](https://huggingface.co/datasets/MCINext/quora-fa) | ['fas'] | Retrieval | s2s | [Web] | None | None | | [QuoraRetrievalHardNegatives](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) (DataCanary et al., 2017) | ['eng'] | Retrieval | s2s | | None | None | | [RARbCode](https://arxiv.org/abs/2404.06347) (Xiao et al., 2024) | ['eng'] | Retrieval | s2p | [Programming, Written] | None | None | | [RARbMath](https://arxiv.org/abs/2404.06347) (Xiao et al., 2024) | ['eng'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | -| [RTE3](https://aclanthology.org/W07-1401/) | ['deu', 'eng', 'fra', 'ita'] | PairClassification | s2s | [News, Web, Encyclopaedic, Written] | None | None | +| [RTE3](https://aclanthology.org/W07-1401/) | ['deu', 'eng', 'fra', 'ita'] | PairClassification | s2s | [Encyclopaedic, News, Web, Written] | None | None | | [RUParaPhraserSTS](https://aclanthology.org/2020.ngt-1.6) (Pivovarova et al., 2017) | ['rus'] | STS | s2s | [News, Written] | None | None | -| [RedditClustering.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle, 2021) | ['eng'] | Clustering | s2s | [Web, Social, Written] | None | None | -| [RedditClusteringP2P.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle, 2021) | ['eng'] | Clustering | p2p | [Web, Social, Written] | {'test': 459389} | {'test': {'num_samples': 459389, 'number_of_characters': 334286895, 'min_text_length': 79, 'average_text_length': 727.68, 'max_text_length': 4359, 'min_labels_per_text': 2, 'average_labels_per_text': 1.0, 'max_labels_per_text': 77908, 'unique_labels': 440, 'labels': {'FortNiteBR': {'count': 436}, 'buildapc': {'count': 8484}, 'offmychest': {'count': 570}, 'nus': {'count': 45}, 'relationship_advice': {'count': 16651}, 'premed': {'count': 201}, 'dogecoin': {'count': 8108}, 'GamingLaptops': {'count': 183}, 'asktransgender': {'count': 326}, 'MachineLearning': {'count': 61}, 'puppy101': {'count': 1597}, 'GunAccessoriesForSale': {'count': 2619}, 
'Random_Acts_Of_Amazon': {'count': 1115}, 'Catholicism': {'count': 183}, 'MonsterHunter': {'count': 218}, 'tipofmypenis': {'count': 87}, 'samsung': {'count': 69}, 'PersonalFinanceCanada': {'count': 341}, 'Dyson_Sphere_Program': {'count': 55}, 'bleach': {'count': 41}, 'AmItheAsshole': {'count': 3730}, 'WallStreetbetsELITE': {'count': 328}, 'GlobalPowers': {'count': 35}, 'ABraThatFits': {'count': 159}, 'PokemonGoFriends': {'count': 1165}, 'NoMansSkyTheGame': {'count': 259}, 'masseffect': {'count': 233}, 'dating_advice': {'count': 559}, 'yoga': {'count': 50}, 'depression': {'count': 515}, 'COVID19positive': {'count': 180}, 'generationology': {'count': 37}, 'feedthebeast': {'count': 192}, 'EliteDangerous': {'count': 270}, 'alcoholicsanonymous': {'count': 93}, 'GoRVing': {'count': 35}, 'thedivision': {'count': 111}, 'breakingmom': {'count': 105}, 'AskAnAmerican': {'count': 80}, 'HypnoFair': {'count': 5}, 'JustUnsubbed': {'count': 13}, 'socialanxiety': {'count': 123}, 'dirtykikpals': {'count': 202}, 'askTO': {'count': 126}, 'AskCulinary': {'count': 108}, 'Bogleheads': {'count': 71}, 'dragonquest': {'count': 45}, 'NoContract': {'count': 30}, 'gorillaz': {'count': 14}, 'MondoGore': {'count': 8}, 'comicswap': {'count': 56}, 'VirtualYoutubers': {'count': 92}, 'Gta5Modding': {'count': 28}, 'obs': {'count': 61}, 'vcu': {'count': 9}, 'KingkillerChronicle': {'count': 17}, 'AmongUs': {'count': 41}, 'wireshark': {'count': 3}, 'Dodocodes': {'count': 46}, 'Aliexpress': {'count': 40}, 'LearnerDriverUK': {'count': 12}, 'PanicAttack': {'count': 23}, 'KassadinMains': {'count': 10}, 'islam': {'count': 93}, 'chronotrigger': {'count': 4}, 'skincareexchange': {'count': 13}, 'PokemonHome': {'count': 21}, 'survivinginfidelity': {'count': 71}, 'igcse': {'count': 21}, 'C25K': {'count': 21}, 'aorus': {'count': 2}, 'idleon': {'count': 19}, 'photography': {'count': 22}, 'cryptocoins': {'count': 7}, 'CanaryWharfBets': {'count': 7}, 'KillingEve': {'count': 7}, 'GameBuilderGarage': {'count': 16}, 
'SauceSharingCommunity': {'count': 7}, 'turo': {'count': 9}, 'foodscience': {'count': 14}, 'HIMYM': {'count': 20}, 'HauntingOfHillHouse': {'count': 4}, 'GoodNotes': {'count': 8}, 'RedditWritesSeinfeld': {'count': 6}, 'AirReps': {'count': 2}, 'ADHD': {'count': 3811}, 'BuddyCrossing': {'count': 446}, 'libraryofruina': {'count': 98}, 'SluttyConfessions': {'count': 2787}, 'tipofmytongue': {'count': 7145}, 'fleshlight': {'count': 128}, 'amcstock': {'count': 13910}, 'teenagers': {'count': 77908}, 'suggestmeabook': {'count': 1540}, 'dirtypenpals': {'count': 5587}, 'MinecraftServer': {'count': 177}, 'CreditCards': {'count': 669}, 'Guitar': {'count': 10952}, 'rpg': {'count': 529}, 'NoFap': {'count': 14853}, 'lfg': {'count': 1093}, 'MarsWallStreet': {'count': 935}, 'SummonSign': {'count': 931}, 'AssassinsCreedValhala': {'count': 295}, 'hoi4': {'count': 432}, 'Coins4Sale': {'count': 260}, 'xbox': {'count': 459}, 'TooAfraidToAsk': {'count': 7404}, 'NBA2k': {'count': 553}, 'KGBTR': {'count': 943}, 'roblox': {'count': 220}, 'salesforce': {'count': 214}, 'TwoXChromosomes': {'count': 1736}, 'mechmarket': {'count': 4863}, 'Gaming_Headsets': {'count': 103}, 'pittsburgh': {'count': 189}, 'CryptoMars': {'count': 1606}, 'FridayNightFunkin': {'count': 378}, 'vaginismus': {'count': 122}, 'transpositive': {'count': 10}, 'comicbooks': {'count': 274}, 'BDSMcommunity': {'count': 185}, 'aliens': {'count': 201}, 'Scotch': {'count': 64}, 'KikRoleplay': {'count': 141}, 'Kayaking': {'count': 91}, '196': {'count': 47}, 'digimon': {'count': 140}, 'Evernote': {'count': 42}, 'logh': {'count': 22}, 'arlington': {'count': 15}, 'Adopted': {'count': 8}, 'DissonautUniverse': {'count': 4}, 'Midsommar': {'count': 12}, 'SofiawithanF': {'count': 83}, 'xmpp': {'count': 6}, 'ZombsRoyale': {'count': 16}, 'accesscontrol': {'count': 8}, 'WetlanderHumor': {'count': 2}, 'PoonamPandeyFanatics': {'count': 2}, 'screenplaychallenge': {'count': 2}, 'scatstories': {'count': 2}, 'techsupport': {'count': 290}, 
'whatcarshouldIbuy': {'count': 79}, 'Stormlight_Archive': {'count': 15}, 'deadbydaylight': {'count': 126}, 'bicycling': {'count': 27}, 'oculus': {'count': 64}, 'Cartalk': {'count': 33}, 'Sims4': {'count': 43}, 'NoFeeAC': {'count': 95}, 'Crypto_com': {'count': 37}, 'ITCareerQuestions': {'count': 259}, 'aromantic': {'count': 18}, 'Revu': {'count': 3}, 'exalted': {'count': 2}, 'HilariaBaldwin': {'count': 20}, 'Testosterone': {'count': 35}, 'Screenwriting': {'count': 170}, 'LifeProTips': {'count': 49}, 'steinsgate': {'count': 13}, 'Baystreetbets': {'count': 10}, 'AskGirls': {'count': 7}, 'idlechampions': {'count': 7}, 'facebook': {'count': 17}, 'tf2trade': {'count': 4}, 'mfdoom': {'count': 3}, 'FiddlesticksMains': {'count': 2}, 'HFY': {'count': 10}, 'FiestaST': {'count': 2}, 'whatsthatbook': {'count': 994}, 'GearsOfWar': {'count': 879}, 'KazuhaMains': {'count': 175}, 'RepTime': {'count': 211}, 'AstroGaming': {'count': 141}, 'metalgearsolid': {'count': 152}, 'qBittorrent': {'count': 39}, 'ELLIPAL_Official': {'count': 24}, 'raisedbynarcissists': {'count': 4895}, 'unpopularopinion': {'count': 14901}, 'ACTrade': {'count': 5679}, 'askcarsales': {'count': 1339}, 'AskVet': {'count': 1357}, 'whowouldwin': {'count': 4493}, 'playstation': {'count': 1362}, 'anime': {'count': 6531}, 'GME': {'count': 12577}, 'DotA2': {'count': 2004}, 'cryptostreetbets': {'count': 2241}, 'MonsterHunterWorld': {'count': 698}, 'Market76': {'count': 14274}, 'DnD': {'count': 5092}, 'leagueoflegends': {'count': 3683}, 'doordash_drivers': {'count': 1626}, 'theta_network': {'count': 489}, 'exmuslim': {'count': 1369}, 'gonewildaudio': {'count': 2998}, 'conspiracy': {'count': 3587}, 'heroesofthestorm': {'count': 535}, 'FanFiction': {'count': 2782}, 'Doom': {'count': 1251}, 'texas': {'count': 269}, 'Vent': {'count': 1738}, 'selfimprovement': {'count': 1284}, 'youtubers': {'count': 706}, 'askseddit': {'count': 237}, 'boardgames': {'count': 1237}, 'bravelydefault': {'count': 347}, 'ConquerorsBlade': {'count': 
238}, 'ChronicPain': {'count': 527}, 'teenagersnew': {'count': 256}, 'brasil': {'count': 1092}, 'MatthiasSubmissions': {'count': 921}, 'MarylandUnemployment': {'count': 314}, 'SaltLakeCity': {'count': 411}, 'BokunoheroFanfiction': {'count': 155}, 'BenignExistence': {'count': 125}, 'GayYoungOldDating': {'count': 156}, 'Bible': {'count': 202}, 'haskell': {'count': 154}, 'seduction': {'count': 400}, 'fantasywriters': {'count': 262}, 'HiveOS': {'count': 100}, 'PerkByDaylight': {'count': 15}, 'Hedgehog': {'count': 73}, 'xmen': {'count': 263}, 'HyperRP': {'count': 122}, 'emotestories': {'count': 3}, 'tutanota': {'count': 135}, 'CultoftheFranklin': {'count': 46}, 'langrisser': {'count': 62}, 'CozyGrove': {'count': 61}, 'Sverigesforsvarsmakt': {'count': 12}, 'silverbugbets': {'count': 21}, 'WreckingBallMains': {'count': 5}, 'capitalism_in_decay': {'count': 8}, 'paintdotnet': {'count': 11}, 'u_mawadom118': {'count': 4}, 'xboxfindfriends': {'count': 2}, 'CPTSD': {'count': 540}, 'destiny2': {'count': 318}, 'Wallstreetsilver': {'count': 1013}, 'DestinyTheGame': {'count': 1107}, 'blackopscoldwar': {'count': 400}, 'InstacartShoppers': {'count': 202}, 'RocketLeagueExchange': {'count': 832}, 'apexlegends': {'count': 3265}, 'kansascity': {'count': 53}, 'namenerds': {'count': 235}, 'help': {'count': 152}, 'Kengan_Ashura': {'count': 132}, 'thetagang': {'count': 165}, 'GameSale': {'count': 262}, 'Reduction': {'count': 109}, 'sex': {'count': 906}, 'bostonr4r': {'count': 75}, 'LegendsOfRuneterra': {'count': 231}, 'overlord': {'count': 48}, 'madisonwi': {'count': 53}, 'steelseries': {'count': 79}, 'ClashOfClansRecruit': {'count': 214}, 'CharacterRant': {'count': 55}, 'AirForce': {'count': 94}, 'sexstories': {'count': 92}, 'NameThatSong': {'count': 162}, 'depressed': {'count': 74}, 'ibs': {'count': 150}, '40kLore': {'count': 269}, 'podcasts': {'count': 88}, 'miraculousladybug': {'count': 150}, 'ask': {'count': 224}, 'EverMerge': {'count': 31}, 'TMJ': {'count': 54}, 'BitLifeApp': {'count': 
39}, 'FireEmblemHeroes': {'count': 100}, 'software': {'count': 62}, 'ShieldAndroidTV': {'count': 70}, 'GriefSupport': {'count': 125}, 'onewheel': {'count': 37}, 'MensRights': {'count': 80}, 'nhl': {'count': 22}, 'ClashOfClans': {'count': 107}, 'ps3homebrew': {'count': 33}, 'LightNovels': {'count': 77}, 'redsox': {'count': 34}, 'CryptoMarkets': {'count': 44}, 'ugly': {'count': 47}, 'GCXRep': {'count': 12}, 'cscareerquestionsEU': {'count': 65}, 'MindHunter': {'count': 6}, 'starcraft2coop': {'count': 15}, 'nanocurrency': {'count': 1421}, 'ModelCars': {'count': 8}, 'UKJobs': {'count': 30}, 'Netherlands': {'count': 44}, 'clonewars': {'count': 8}, 'Julia': {'count': 11}, 'Prolactinoma': {'count': 9}, 'sofi': {'count': 11}, 'royalfamily': {'count': 6}, 'ConnecticutR4R': {'count': 8}, 'weather': {'count': 5}, 'oneui': {'count': 7}, 'KTM': {'count': 5}, 'Aerials': {'count': 3}, 'seoul': {'count': 2}, 'exjw': {'count': 3281}, 'ModernMagic': {'count': 699}, 'Paladins': {'count': 1242}, 'kdramarecommends': {'count': 1611}, 'hitbtc': {'count': 330}, 'endocrinology': {'count': 75}, 'Bath': {'count': 43}, 'NassauCountyHookups': {'count': 5}, 'feminineboys': {'count': 1248}, 'dreamsmp': {'count': 2018}, 'SquaredCircle': {'count': 2255}, 'Minecraft': {'count': 8753}, 'spirituality': {'count': 1809}, 'Eldenring': {'count': 1471}, 'Sat': {'count': 1172}, 'bonnaroo': {'count': 194}, 'gardening': {'count': 1892}, 'Unemployment': {'count': 6185}, 'mac': {'count': 1847}, 'Bestbuy': {'count': 437}, 'quittingkratom': {'count': 1081}, 'lawschooladmissions': {'count': 3436}, 'NiceHash': {'count': 2135}, 'McMaster': {'count': 815}, 'covidlonghaulers': {'count': 1299}, 'stalker': {'count': 758}, 'MLBTheShow': {'count': 2721}, 'FortniteCompetitive': {'count': 998}, 'dpdr': {'count': 514}, 'appliancerepair': {'count': 720}, 'thomasthetankengine': {'count': 207}, 'delhi': {'count': 217}, 'Huel': {'count': 300}, 'leafs': {'count': 203}, 'HotWheels': {'count': 170}, '90dayfianceuncensored': 
{'count': 550}, 'Throwers': {'count': 142}, 'Wavyhair': {'count': 270}, 'CryptoHorde': {'count': 128}, 'ShuumatsuNoValkyrie': {'count': 453}, 'TeensMeetTeens': {'count': 432}, 'dbrand': {'count': 108}, 'SLFmeetups': {'count': 18}, '1200isplentyketo': {'count': 48}, 'passive_income': {'count': 211}, 'BroadCity': {'count': 16}, 'RevenantMain': {'count': 71}, 'extrarfl': {'count': 25}, 'AgonGame': {'count': 5}, 'FitnessDE': {'count': 3}, 'gaming': {'count': 1277}, 'livesound': {'count': 91}, 'IBO': {'count': 1896}, 'EscapefromTarkov': {'count': 1300}, 'amex': {'count': 145}, 'DMAcademy': {'count': 1411}, 'VinylCollectors': {'count': 556}, 'cardano': {'count': 716}, 'brave_browser': {'count': 159}, 'dating': {'count': 952}, 'OculusQuest': {'count': 942}, 'Superstonk': {'count': 3089}, 'MtF': {'count': 957}, 'findaleague': {'count': 207}, 'Nioh': {'count': 398}, 'IRS': {'count': 715}, 'transgendercirclejerk': {'count': 353}, 'learnmath': {'count': 489}, 'piano': {'count': 263}, 'LeagueConnect': {'count': 216}, 'eu4': {'count': 561}, 'Wordpress': {'count': 345}, 'RoleplayingForReddit': {'count': 31}, 'LOONA': {'count': 89}, 'newtothenavy': {'count': 167}, 'HaircareScience': {'count': 118}, 'appletv': {'count': 167}, 'sissypersonals': {'count': 102}, 'raleigh': {'count': 168}, 'realonlyfansreviews': {'count': 21}, 'AskGames': {'count': 49}, 'PokemonTCG': {'count': 325}, 'controlgame': {'count': 109}, 'GoogleDataStudio': {'count': 16}, 'WhiteWolfRPG': {'count': 139}, 'MECoOp': {'count': 31}, 'snuffrp': {'count': 46}, 'lockpicking': {'count': 103}, 'wicked_edge': {'count': 105}, 'BMW': {'count': 99}, 'choiceofgames': {'count': 24}, 'hisdarkmaterials': {'count': 12}, 'SakuraGakuin': {'count': 24}, 'detrans': {'count': 55}, 'Smallville': {'count': 37}, 'kingofqueens': {'count': 7}, 'JamesHoffmann': {'count': 22}, 'stashinvest': {'count': 16}, 'ABA': {'count': 79}, 'ladybusiness': {'count': 10}, 'gamegrumps': {'count': 32}, 'GodEater': {'count': 21}, 'tomorrow': {'count': 39}, 
'Tomorrowland': {'count': 9}, 'BlackCountryNewRoad': {'count': 5}, 'STAYC': {'count': 3}, 'SatoshiStreetBets': {'count': 3828}, 'AskLosAngeles': {'count': 1036}, 'buildapcforme': {'count': 1689}, 'ApplyingToCollege': {'count': 10675}, 'watercooling': {'count': 1209}, 'BreakUps': {'count': 4914}, 'FIFA': {'count': 3811}, 'emacs': {'count': 712}, 'trakstocks': {'count': 691}, 'Shittyaskflying': {'count': 147}, 'AmazonFC': {'count': 1178}, 'stocks': {'count': 4610}, 'BangaloreMains': {'count': 26}, 'pokemon': {'count': 3953}, 'religion': {'count': 684}, 'cuboulder': {'count': 269}, 'self': {'count': 1688}, 'tarot': {'count': 912}, 'turtles': {'count': 49}, 'TheMagnusArchives': {'count': 300}, 'Superhero_Ideas': {'count': 34}, 'NTU': {'count': 308}, 'touhou': {'count': 623}, 'JoJolion': {'count': 50}, 'lasers': {'count': 27}, 'popperpigs': {'count': 67}, 'aggretsuko': {'count': 20}, 'Library': {'count': 5}}}} | +| [RedditClustering.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle, 2021) | ['eng'] | Clustering | s2s | [Social, Web, Written] | None | None | +| [RedditClusteringP2P.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle, 2021) | ['eng'] | Clustering | p2p | [Social, Web, Written] | {'test': 459389} | {'test': {'num_samples': 459389, 'number_of_characters': 334286895, 'min_text_length': 79, 'average_text_length': 727.68, 'max_text_length': 4359, 'min_labels_per_text': 2, 'average_labels_per_text': 1.0, 'max_labels_per_text': 77908, 'unique_labels': 440, 'labels': {'FortNiteBR': {'count': 436}, 'buildapc': {'count': 8484}, 'offmychest': {'count': 570}, 'nus': {'count': 45}, 'relationship_advice': {'count': 16651}, 'premed': {'count': 201}, 'dogecoin': {'count': 8108}, 'GamingLaptops': {'count': 183}, 'asktransgender': {'count': 326}, 'MachineLearning': {'count': 61}, 'puppy101': {'count': 1597}, 'GunAccessoriesForSale': {'count': 2619}, 'Random_Acts_Of_Amazon': {'count': 1115}, 'Catholicism': {'count': 183}, 'MonsterHunter': {'count': 218}, 'tipofmypenis': 
{'count': 87}, 'samsung': {'count': 69}, 'PersonalFinanceCanada': {'count': 341}, 'Dyson_Sphere_Program': {'count': 55}, 'bleach': {'count': 41}, 'AmItheAsshole': {'count': 3730}, 'WallStreetbetsELITE': {'count': 328}, 'GlobalPowers': {'count': 35}, 'ABraThatFits': {'count': 159}, 'PokemonGoFriends': {'count': 1165}, 'NoMansSkyTheGame': {'count': 259}, 'masseffect': {'count': 233}, 'dating_advice': {'count': 559}, 'yoga': {'count': 50}, 'depression': {'count': 515}, 'COVID19positive': {'count': 180}, 'generationology': {'count': 37}, 'feedthebeast': {'count': 192}, 'EliteDangerous': {'count': 270}, 'alcoholicsanonymous': {'count': 93}, 'GoRVing': {'count': 35}, 'thedivision': {'count': 111}, 'breakingmom': {'count': 105}, 'AskAnAmerican': {'count': 80}, 'HypnoFair': {'count': 5}, 'JustUnsubbed': {'count': 13}, 'socialanxiety': {'count': 123}, 'dirtykikpals': {'count': 202}, 'askTO': {'count': 126}, 'AskCulinary': {'count': 108}, 'Bogleheads': {'count': 71}, 'dragonquest': {'count': 45}, 'NoContract': {'count': 30}, 'gorillaz': {'count': 14}, 'MondoGore': {'count': 8}, 'comicswap': {'count': 56}, 'VirtualYoutubers': {'count': 92}, 'Gta5Modding': {'count': 28}, 'obs': {'count': 61}, 'vcu': {'count': 9}, 'KingkillerChronicle': {'count': 17}, 'AmongUs': {'count': 41}, 'wireshark': {'count': 3}, 'Dodocodes': {'count': 46}, 'Aliexpress': {'count': 40}, 'LearnerDriverUK': {'count': 12}, 'PanicAttack': {'count': 23}, 'KassadinMains': {'count': 10}, 'islam': {'count': 93}, 'chronotrigger': {'count': 4}, 'skincareexchange': {'count': 13}, 'PokemonHome': {'count': 21}, 'survivinginfidelity': {'count': 71}, 'igcse': {'count': 21}, 'C25K': {'count': 21}, 'aorus': {'count': 2}, 'idleon': {'count': 19}, 'photography': {'count': 22}, 'cryptocoins': {'count': 7}, 'CanaryWharfBets': {'count': 7}, 'KillingEve': {'count': 7}, 'GameBuilderGarage': {'count': 16}, 'SauceSharingCommunity': {'count': 7}, 'turo': {'count': 9}, 'foodscience': {'count': 14}, 'HIMYM': {'count': 20}, 
'HauntingOfHillHouse': {'count': 4}, 'GoodNotes': {'count': 8}, 'RedditWritesSeinfeld': {'count': 6}, 'AirReps': {'count': 2}, 'ADHD': {'count': 3811}, 'BuddyCrossing': {'count': 446}, 'libraryofruina': {'count': 98}, 'SluttyConfessions': {'count': 2787}, 'tipofmytongue': {'count': 7145}, 'fleshlight': {'count': 128}, 'amcstock': {'count': 13910}, 'teenagers': {'count': 77908}, 'suggestmeabook': {'count': 1540}, 'dirtypenpals': {'count': 5587}, 'MinecraftServer': {'count': 177}, 'CreditCards': {'count': 669}, 'Guitar': {'count': 10952}, 'rpg': {'count': 529}, 'NoFap': {'count': 14853}, 'lfg': {'count': 1093}, 'MarsWallStreet': {'count': 935}, 'SummonSign': {'count': 931}, 'AssassinsCreedValhala': {'count': 295}, 'hoi4': {'count': 432}, 'Coins4Sale': {'count': 260}, 'xbox': {'count': 459}, 'TooAfraidToAsk': {'count': 7404}, 'NBA2k': {'count': 553}, 'KGBTR': {'count': 943}, 'roblox': {'count': 220}, 'salesforce': {'count': 214}, 'TwoXChromosomes': {'count': 1736}, 'mechmarket': {'count': 4863}, 'Gaming_Headsets': {'count': 103}, 'pittsburgh': {'count': 189}, 'CryptoMars': {'count': 1606}, 'FridayNightFunkin': {'count': 378}, 'vaginismus': {'count': 122}, 'transpositive': {'count': 10}, 'comicbooks': {'count': 274}, 'BDSMcommunity': {'count': 185}, 'aliens': {'count': 201}, 'Scotch': {'count': 64}, 'KikRoleplay': {'count': 141}, 'Kayaking': {'count': 91}, '196': {'count': 47}, 'digimon': {'count': 140}, 'Evernote': {'count': 42}, 'logh': {'count': 22}, 'arlington': {'count': 15}, 'Adopted': {'count': 8}, 'DissonautUniverse': {'count': 4}, 'Midsommar': {'count': 12}, 'SofiawithanF': {'count': 83}, 'xmpp': {'count': 6}, 'ZombsRoyale': {'count': 16}, 'accesscontrol': {'count': 8}, 'WetlanderHumor': {'count': 2}, 'PoonamPandeyFanatics': {'count': 2}, 'screenplaychallenge': {'count': 2}, 'scatstories': {'count': 2}, 'techsupport': {'count': 290}, 'whatcarshouldIbuy': {'count': 79}, 'Stormlight_Archive': {'count': 15}, 'deadbydaylight': {'count': 126}, 'bicycling': 
{'count': 27}, 'oculus': {'count': 64}, 'Cartalk': {'count': 33}, 'Sims4': {'count': 43}, 'NoFeeAC': {'count': 95}, 'Crypto_com': {'count': 37}, 'ITCareerQuestions': {'count': 259}, 'aromantic': {'count': 18}, 'Revu': {'count': 3}, 'exalted': {'count': 2}, 'HilariaBaldwin': {'count': 20}, 'Testosterone': {'count': 35}, 'Screenwriting': {'count': 170}, 'LifeProTips': {'count': 49}, 'steinsgate': {'count': 13}, 'Baystreetbets': {'count': 10}, 'AskGirls': {'count': 7}, 'idlechampions': {'count': 7}, 'facebook': {'count': 17}, 'tf2trade': {'count': 4}, 'mfdoom': {'count': 3}, 'FiddlesticksMains': {'count': 2}, 'HFY': {'count': 10}, 'FiestaST': {'count': 2}, 'whatsthatbook': {'count': 994}, 'GearsOfWar': {'count': 879}, 'KazuhaMains': {'count': 175}, 'RepTime': {'count': 211}, 'AstroGaming': {'count': 141}, 'metalgearsolid': {'count': 152}, 'qBittorrent': {'count': 39}, 'ELLIPAL_Official': {'count': 24}, 'raisedbynarcissists': {'count': 4895}, 'unpopularopinion': {'count': 14901}, 'ACTrade': {'count': 5679}, 'askcarsales': {'count': 1339}, 'AskVet': {'count': 1357}, 'whowouldwin': {'count': 4493}, 'playstation': {'count': 1362}, 'anime': {'count': 6531}, 'GME': {'count': 12577}, 'DotA2': {'count': 2004}, 'cryptostreetbets': {'count': 2241}, 'MonsterHunterWorld': {'count': 698}, 'Market76': {'count': 14274}, 'DnD': {'count': 5092}, 'leagueoflegends': {'count': 3683}, 'doordash_drivers': {'count': 1626}, 'theta_network': {'count': 489}, 'exmuslim': {'count': 1369}, 'gonewildaudio': {'count': 2998}, 'conspiracy': {'count': 3587}, 'heroesofthestorm': {'count': 535}, 'FanFiction': {'count': 2782}, 'Doom': {'count': 1251}, 'texas': {'count': 269}, 'Vent': {'count': 1738}, 'selfimprovement': {'count': 1284}, 'youtubers': {'count': 706}, 'askseddit': {'count': 237}, 'boardgames': {'count': 1237}, 'bravelydefault': {'count': 347}, 'ConquerorsBlade': {'count': 238}, 'ChronicPain': {'count': 527}, 'teenagersnew': {'count': 256}, 'brasil': {'count': 1092}, 'MatthiasSubmissions': 
{'count': 921}, 'MarylandUnemployment': {'count': 314}, 'SaltLakeCity': {'count': 411}, 'BokunoheroFanfiction': {'count': 155}, 'BenignExistence': {'count': 125}, 'GayYoungOldDating': {'count': 156}, 'Bible': {'count': 202}, 'haskell': {'count': 154}, 'seduction': {'count': 400}, 'fantasywriters': {'count': 262}, 'HiveOS': {'count': 100}, 'PerkByDaylight': {'count': 15}, 'Hedgehog': {'count': 73}, 'xmen': {'count': 263}, 'HyperRP': {'count': 122}, 'emotestories': {'count': 3}, 'tutanota': {'count': 135}, 'CultoftheFranklin': {'count': 46}, 'langrisser': {'count': 62}, 'CozyGrove': {'count': 61}, 'Sverigesforsvarsmakt': {'count': 12}, 'silverbugbets': {'count': 21}, 'WreckingBallMains': {'count': 5}, 'capitalism_in_decay': {'count': 8}, 'paintdotnet': {'count': 11}, 'u_mawadom118': {'count': 4}, 'xboxfindfriends': {'count': 2}, 'CPTSD': {'count': 540}, 'destiny2': {'count': 318}, 'Wallstreetsilver': {'count': 1013}, 'DestinyTheGame': {'count': 1107}, 'blackopscoldwar': {'count': 400}, 'InstacartShoppers': {'count': 202}, 'RocketLeagueExchange': {'count': 832}, 'apexlegends': {'count': 3265}, 'kansascity': {'count': 53}, 'namenerds': {'count': 235}, 'help': {'count': 152}, 'Kengan_Ashura': {'count': 132}, 'thetagang': {'count': 165}, 'GameSale': {'count': 262}, 'Reduction': {'count': 109}, 'sex': {'count': 906}, 'bostonr4r': {'count': 75}, 'LegendsOfRuneterra': {'count': 231}, 'overlord': {'count': 48}, 'madisonwi': {'count': 53}, 'steelseries': {'count': 79}, 'ClashOfClansRecruit': {'count': 214}, 'CharacterRant': {'count': 55}, 'AirForce': {'count': 94}, 'sexstories': {'count': 92}, 'NameThatSong': {'count': 162}, 'depressed': {'count': 74}, 'ibs': {'count': 150}, '40kLore': {'count': 269}, 'podcasts': {'count': 88}, 'miraculousladybug': {'count': 150}, 'ask': {'count': 224}, 'EverMerge': {'count': 31}, 'TMJ': {'count': 54}, 'BitLifeApp': {'count': 39}, 'FireEmblemHeroes': {'count': 100}, 'software': {'count': 62}, 'ShieldAndroidTV': {'count': 70}, 'GriefSupport': 
{'count': 125}, 'onewheel': {'count': 37}, 'MensRights': {'count': 80}, 'nhl': {'count': 22}, 'ClashOfClans': {'count': 107}, 'ps3homebrew': {'count': 33}, 'LightNovels': {'count': 77}, 'redsox': {'count': 34}, 'CryptoMarkets': {'count': 44}, 'ugly': {'count': 47}, 'GCXRep': {'count': 12}, 'cscareerquestionsEU': {'count': 65}, 'MindHunter': {'count': 6}, 'starcraft2coop': {'count': 15}, 'nanocurrency': {'count': 1421}, 'ModelCars': {'count': 8}, 'UKJobs': {'count': 30}, 'Netherlands': {'count': 44}, 'clonewars': {'count': 8}, 'Julia': {'count': 11}, 'Prolactinoma': {'count': 9}, 'sofi': {'count': 11}, 'royalfamily': {'count': 6}, 'ConnecticutR4R': {'count': 8}, 'weather': {'count': 5}, 'oneui': {'count': 7}, 'KTM': {'count': 5}, 'Aerials': {'count': 3}, 'seoul': {'count': 2}, 'exjw': {'count': 3281}, 'ModernMagic': {'count': 699}, 'Paladins': {'count': 1242}, 'kdramarecommends': {'count': 1611}, 'hitbtc': {'count': 330}, 'endocrinology': {'count': 75}, 'Bath': {'count': 43}, 'NassauCountyHookups': {'count': 5}, 'feminineboys': {'count': 1248}, 'dreamsmp': {'count': 2018}, 'SquaredCircle': {'count': 2255}, 'Minecraft': {'count': 8753}, 'spirituality': {'count': 1809}, 'Eldenring': {'count': 1471}, 'Sat': {'count': 1172}, 'bonnaroo': {'count': 194}, 'gardening': {'count': 1892}, 'Unemployment': {'count': 6185}, 'mac': {'count': 1847}, 'Bestbuy': {'count': 437}, 'quittingkratom': {'count': 1081}, 'lawschooladmissions': {'count': 3436}, 'NiceHash': {'count': 2135}, 'McMaster': {'count': 815}, 'covidlonghaulers': {'count': 1299}, 'stalker': {'count': 758}, 'MLBTheShow': {'count': 2721}, 'FortniteCompetitive': {'count': 998}, 'dpdr': {'count': 514}, 'appliancerepair': {'count': 720}, 'thomasthetankengine': {'count': 207}, 'delhi': {'count': 217}, 'Huel': {'count': 300}, 'leafs': {'count': 203}, 'HotWheels': {'count': 170}, '90dayfianceuncensored': {'count': 550}, 'Throwers': {'count': 142}, 'Wavyhair': {'count': 270}, 'CryptoHorde': {'count': 128}, 'ShuumatsuNoValkyrie': 
{'count': 453}, 'TeensMeetTeens': {'count': 432}, 'dbrand': {'count': 108}, 'SLFmeetups': {'count': 18}, '1200isplentyketo': {'count': 48}, 'passive_income': {'count': 211}, 'BroadCity': {'count': 16}, 'RevenantMain': {'count': 71}, 'extrarfl': {'count': 25}, 'AgonGame': {'count': 5}, 'FitnessDE': {'count': 3}, 'gaming': {'count': 1277}, 'livesound': {'count': 91}, 'IBO': {'count': 1896}, 'EscapefromTarkov': {'count': 1300}, 'amex': {'count': 145}, 'DMAcademy': {'count': 1411}, 'VinylCollectors': {'count': 556}, 'cardano': {'count': 716}, 'brave_browser': {'count': 159}, 'dating': {'count': 952}, 'OculusQuest': {'count': 942}, 'Superstonk': {'count': 3089}, 'MtF': {'count': 957}, 'findaleague': {'count': 207}, 'Nioh': {'count': 398}, 'IRS': {'count': 715}, 'transgendercirclejerk': {'count': 353}, 'learnmath': {'count': 489}, 'piano': {'count': 263}, 'LeagueConnect': {'count': 216}, 'eu4': {'count': 561}, 'Wordpress': {'count': 345}, 'RoleplayingForReddit': {'count': 31}, 'LOONA': {'count': 89}, 'newtothenavy': {'count': 167}, 'HaircareScience': {'count': 118}, 'appletv': {'count': 167}, 'sissypersonals': {'count': 102}, 'raleigh': {'count': 168}, 'realonlyfansreviews': {'count': 21}, 'AskGames': {'count': 49}, 'PokemonTCG': {'count': 325}, 'controlgame': {'count': 109}, 'GoogleDataStudio': {'count': 16}, 'WhiteWolfRPG': {'count': 139}, 'MECoOp': {'count': 31}, 'snuffrp': {'count': 46}, 'lockpicking': {'count': 103}, 'wicked_edge': {'count': 105}, 'BMW': {'count': 99}, 'choiceofgames': {'count': 24}, 'hisdarkmaterials': {'count': 12}, 'SakuraGakuin': {'count': 24}, 'detrans': {'count': 55}, 'Smallville': {'count': 37}, 'kingofqueens': {'count': 7}, 'JamesHoffmann': {'count': 22}, 'stashinvest': {'count': 16}, 'ABA': {'count': 79}, 'ladybusiness': {'count': 10}, 'gamegrumps': {'count': 32}, 'GodEater': {'count': 21}, 'tomorrow': {'count': 39}, 'Tomorrowland': {'count': 9}, 'BlackCountryNewRoad': {'count': 5}, 'STAYC': {'count': 3}, 'SatoshiStreetBets': {'count': 
3828}, 'AskLosAngeles': {'count': 1036}, 'buildapcforme': {'count': 1689}, 'ApplyingToCollege': {'count': 10675}, 'watercooling': {'count': 1209}, 'BreakUps': {'count': 4914}, 'FIFA': {'count': 3811}, 'emacs': {'count': 712}, 'trakstocks': {'count': 691}, 'Shittyaskflying': {'count': 147}, 'AmazonFC': {'count': 1178}, 'stocks': {'count': 4610}, 'BangaloreMains': {'count': 26}, 'pokemon': {'count': 3953}, 'religion': {'count': 684}, 'cuboulder': {'count': 269}, 'self': {'count': 1688}, 'tarot': {'count': 912}, 'turtles': {'count': 49}, 'TheMagnusArchives': {'count': 300}, 'Superhero_Ideas': {'count': 34}, 'NTU': {'count': 308}, 'touhou': {'count': 623}, 'JoJolion': {'count': 50}, 'lasers': {'count': 27}, 'popperpigs': {'count': 67}, 'aggretsuko': {'count': 20}, 'Library': {'count': 5}}}} | | [RestaurantReviewSentimentClassification](https://link.springer.com/chapter/10.1007/978-3-319-18117-2_2) (ElSahar et al., 2015) | ['ara'] | Classification | s2s | [Reviews, Written] | None | None | | [RiaNewsRetrieval](https://arxiv.org/abs/1901.07786) (Gavrilov et al., 2019) | ['rus'] | Retrieval | s2p | [News, Written] | None | None | | [RiaNewsRetrievalHardNegatives](https://arxiv.org/abs/1901.07786) (Gavrilov et al., 2019) | ['rus'] | Retrieval | s2p | [News, Written] | None | None | @@ -514,7 +514,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [SCDDCertificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [SCDDTrainingLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | | [SCDDVerificationLegalBenchClassification](https://huggingface.co/datasets/nguha/legalbench) (Neel Guha, 2023) | ['eng'] | Classification | s2s | [Legal, Written] | None | None | -| [SCIDOCS](https://allenai.org/data/scidocs) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Written, Non-fiction] | None | None | +| [SCIDOCS](https://allenai.org/data/scidocs) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Non-fiction, Written] | None | None | | [SCIDOCS-Fa](https://huggingface.co/datasets/MCINext/scidocs-fa) | ['fas'] | Retrieval | s2p | [Academic] | None | None | | [SCIDOCS-PL](https://allenai.org/data/scidocs) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | | None | None | | [SDSEyeProtectionClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2p | [Chemistry] | None | None | @@ -536,25 +536,25 @@ The following tables give you an overview of the tasks in MTEB. 
| [SNLRetrieval](https://huggingface.co/datasets/navjordj/SNL_summarization) (Navjord et al., 2023) | ['nob'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Written] | None | None | | [SRNCorpusBitextMining](https://arxiv.org/abs/2212.06383) (Zwennicker et al., 2022) | ['nld', 'srn'] | BitextMining | s2s | [Social, Web, Written] | None | None | | [STS12](https://www.aclweb.org/anthology/S12-1051.pdf) (Agirre et al., 2012) | ['eng'] | STS | s2s | [Encyclopaedic, News, Written] | {'test': 3108} | {'test': {'num_samples': 3108, 'number_of_characters': 402118, 'min_sentence1_length': 3, 'average_sentence1_len': 63.79, 'max_sentence1_length': 220, 'unique_sentence1': 2236, 'min_sentence2_length': 7, 'average_sentence2_len': 65.59, 'max_sentence2_length': 204, 'unique_sentence2': 2797, 'min_score': 0.0, 'avg_score': 3.51, 'max_score': 5.0}} | -| [STS13](https://www.aclweb.org/anthology/S13-1004/) (Eneko Agirre, 2013) | ['eng'] | STS | s2s | [Web, News, Non-fiction, Written] | None | None | -| [STS14](https://www.aclweb.org/anthology/S14-1002) | ['eng'] | STS | s2s | [Blog, Web, Spoken] | None | None | -| [STS15](https://www.aclweb.org/anthology/S15-2010) | ['eng'] | STS | s2s | [Blog, News, Web, Written, Spoken] | None | None | -| [STS16](https://www.aclweb.org/anthology/S16-1001) | ['eng'] | STS | s2s | [Blog, Web, Spoken] | None | None | +| [STS13](https://www.aclweb.org/anthology/S13-1004/) (Eneko Agirre, 2013) | ['eng'] | STS | s2s | [News, Non-fiction, Web, Written] | None | None | +| [STS14](https://www.aclweb.org/anthology/S14-1002) | ['eng'] | STS | s2s | [Blog, Spoken, Web] | None | None | +| [STS15](https://www.aclweb.org/anthology/S15-2010) | ['eng'] | STS | s2s | [Blog, News, Spoken, Web, Written] | None | None | +| [STS16](https://www.aclweb.org/anthology/S16-1001) | ['eng'] | STS | s2s | [Blog, Spoken, Web] | None | None | | [STS17](https://alt.qcri.org/semeval2017/task1/) | ['ara', 'deu', 'eng', 'fra', 'ita', 'kor', 'nld', 'spa', 'tur'] | STS | s2s | 
[News, Web, Written] | {'test': 5346} | {'test': {'num_samples': 5346, 'number_of_characters': 400264, 'min_sentence1_length': 6, 'average_sentence1_len': 38.15, 'max_sentence1_length': 976, 'unique_sentence1': 4900, 'min_sentence2_length': 6, 'average_sentence2_len': 36.73, 'max_sentence2_length': 1007, 'unique_sentence2': 4470, 'min_score': 0.0, 'avg_score': 2.36, 'max_score': 5.0, 'hf_subset_descriptive_stats': {'ko-ko': {'num_samples': 2846, 'number_of_characters': 183387, 'min_sentence1_length': 6, 'average_sentence1_len': 31.99, 'max_sentence1_length': 976, 'unique_sentence1': 2650, 'min_sentence2_length': 6, 'average_sentence2_len': 32.44, 'max_sentence2_length': 1007, 'unique_sentence2': 2720, 'min_score': 0.0, 'avg_score': 2.47, 'max_score': 5.0}, 'ar-ar': {'num_samples': 250, 'number_of_characters': 16247, 'min_sentence1_length': 11, 'average_sentence1_len': 32.21, 'max_sentence1_length': 99, 'unique_sentence1': 250, 'min_sentence2_length': 9, 'average_sentence2_len': 32.78, 'max_sentence2_length': 83, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.22, 'max_score': 5.0}, 'en-ar': {'num_samples': 250, 'number_of_characters': 18764, 'min_sentence1_length': 13, 'average_sentence1_len': 42.36, 'max_sentence1_length': 105, 'unique_sentence1': 250, 'min_sentence2_length': 10, 'average_sentence2_len': 32.7, 'max_sentence2_length': 104, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.14, 'max_score': 5.0}, 'en-de': {'num_samples': 250, 'number_of_characters': 22177, 'min_sentence1_length': 12, 'average_sentence1_len': 43.95, 'max_sentence1_length': 94, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 44.76, 'max_sentence2_length': 104, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'en-en': {'num_samples': 250, 'number_of_characters': 21669, 'min_sentence1_length': 12, 'average_sentence1_len': 43.95, 'max_sentence1_length': 94, 'unique_sentence1': 250, 'min_sentence2_length': 15, 
'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'en-tr': {'num_samples': 250, 'number_of_characters': 20879, 'min_sentence1_length': 15, 'average_sentence1_len': 41.92, 'max_sentence1_length': 101, 'unique_sentence1': 250, 'min_sentence2_length': 10, 'average_sentence2_len': 41.6, 'max_sentence2_length': 107, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.13, 'max_score': 5.0}, 'es-en': {'num_samples': 250, 'number_of_characters': 23216, 'min_sentence1_length': 12, 'average_sentence1_len': 50.84, 'max_sentence1_length': 160, 'unique_sentence1': 250, 'min_sentence2_length': 14, 'average_sentence2_len': 42.02, 'max_sentence2_length': 117, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.15, 'max_score': 5.0}, 'es-es': {'num_samples': 250, 'number_of_characters': 25265, 'min_sentence1_length': 18, 'average_sentence1_len': 49.84, 'max_sentence1_length': 136, 'unique_sentence1': 250, 'min_sentence2_length': 13, 'average_sentence2_len': 51.22, 'max_sentence2_length': 129, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.23, 'max_score': 5.0}, 'fr-en': {'num_samples': 250, 'number_of_characters': 23087, 'min_sentence1_length': 19, 'average_sentence1_len': 49.62, 'max_sentence1_length': 115, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'it-en': {'num_samples': 250, 'number_of_characters': 23188, 'min_sentence1_length': 15, 'average_sentence1_len': 50.03, 'max_sentence1_length': 113, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}, 'nl-en': {'num_samples': 250, 'number_of_characters': 22385, 'min_sentence1_length': 14, 'average_sentence1_len': 46.82, 
'max_sentence1_length': 123, 'unique_sentence1': 250, 'min_sentence2_length': 15, 'average_sentence2_len': 42.72, 'max_sentence2_length': 101, 'unique_sentence2': 250, 'min_score': 0.0, 'avg_score': 2.28, 'max_score': 5.0}}}} | | [STS22.v2](https://competitions.codalab.org/competitions/33835) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'ita', 'pol', 'rus', 'spa', 'tur'] | STS | p2p | [News, Written] | None | None | | [STSB](https://aclanthology.org/2021.emnlp-main.357) (Shitao Xiao, 2024) | ['cmn'] | STS | s2s | | None | None | | [STSBenchmark](https://github.com/PhilipMay/stsb-multi-mt/) (Philip May, 2021) | ['eng'] | STS | s2s | [Blog, News, Written] | None | None | -| [STSBenchmarkMultilingualSTS](https://github.com/PhilipMay/stsb-multi-mt/) (Philip May, 2021) | ['cmn', 'deu', 'eng', 'fra', 'ita', 'nld', 'pol', 'por', 'rus', 'spa'] | STS | s2s | [News, Social, Web, Spoken, Written] | None | None | +| [STSBenchmarkMultilingualSTS](https://github.com/PhilipMay/stsb-multi-mt/) (Philip May, 2021) | ['cmn', 'deu', 'eng', 'fra', 'ita', 'nld', 'pol', 'por', 'rus', 'spa'] | STS | s2s | [News, Social, Spoken, Web, Written] | None | None | | [STSES](https://huggingface.co/datasets/PlanTL-GOB-ES/sts-es) (Agirre et al., 2015) | ['spa'] | STS | s2s | [Written] | None | None | | [SadeemQuestionRetrieval](https://huggingface.co/datasets/sadeem-ai/sadeem-ar-eval-retrieval-questions) | ['ara'] | Retrieval | s2p | [Written, Written] | None | None | | [SanskritShlokasClassification](https://github.com/goru001/nlp-for-sanskrit) | ['san'] | Classification | s2s | [Religious, Written] | None | None | -| [ScalaClassification](https://aclanthology.org/2023.nodalida-1.20/) | ['dan', 'nno', 'nob', 'swe'] | Classification | s2s | [Fiction, News, Non-fiction, Blog, Spoken, Web, Written] | None | None | +| [ScalaClassification](https://aclanthology.org/2023.nodalida-1.20/) | ['dan', 'nno', 'nob', 'swe'] | Classification | s2s | [Blog, Fiction, News, Non-fiction, Spoken, Web, Written] | None | 
None | | [SciDocsRR](https://allenai.org/data/scidocs) | ['eng'] | Reranking | s2s | [Academic, Non-fiction, Written] | None | None | | [SciFact](https://github.com/allenai/scifact) (Arman Cohan, 2020) | ['eng'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | | [SciFact-Fa](https://huggingface.co/datasets/MCINext/scifact-fa) | ['fas'] | Retrieval | s2p | [Academic] | None | None | | [SciFact-PL](https://github.com/allenai/scifact) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | | [SemRel24STS](https://huggingface.co/datasets/SemRel/SemRel2024) (Nedjma Ousidhoum, 2024) | ['afr', 'amh', 'arb', 'arq', 'ary', 'eng', 'hau', 'hin', 'ind', 'kin', 'mar', 'tel'] | STS | s2s | [Spoken, Written] | None | None | -| [SensitiveTopicsClassification](https://aclanthology.org/2021.bsnlp-1.4) | ['rus'] | MultilabelClassification | s2s | [Web, Social, Written] | None | None | +| [SensitiveTopicsClassification](https://aclanthology.org/2021.bsnlp-1.4) | ['rus'] | MultilabelClassification | s2s | [Social, Web, Written] | None | None | | [SentimentAnalysisHindi](https://huggingface.co/datasets/OdiaGenAI/sentiment_analysis_hindi) (Shantipriya Parida, 2023) | ['hin'] | Classification | s2s | [Reviews, Written] | None | None | | [SentimentDKSF](https://github.com/hezarai/hezar) | ['fas'] | Classification | s2p | [Reviews] | None | None | | [SinhalaNewsClassification](https://huggingface.co/datasets/NLPC-UOM/Sinhala-News-Category-classification) (Nisansa de Silva, 2015) | ['sin'] | Classification | s2s | [News, Written] | None | None | @@ -563,7 +563,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [SlovakHateSpeechClassification](https://huggingface.co/datasets/TUKE-KEMT/hate_speech_slovak) | ['slk'] | Classification | s2s | [Social, Written] | {'test': 1319, 'train': 11870} | {'test': {'num_samples': 1319, 'number_of_characters': 122279, 'num_texts_in_train': 46, 'min_text_length': 8, 'average_text_length': 92.71, 'max_text_length': 1584, 'unique_text': 1315, 'unique_labels': 2, 'labels': {'1': {'count': 360}, '0': {'count': 959}}}, 'train': {'num_samples': 11870, 'number_of_characters': 1130860, 'num_texts_in_train': None, 'min_text_length': 7, 'average_text_length': 95.27, 'max_text_length': 2112, 'unique_text': 11655, 'unique_labels': 2, 'labels': {'1': {'count': 3245}, '0': {'count': 8625}}}} | | [SlovakMovieReviewSentimentClassification](https://arxiv.org/pdf/2304.01922) ({ {S, 2023) | ['svk'] | Classification | s2s | [Reviews, Written] | None | None | | [SlovakSumRetrieval](https://huggingface.co/datasets/NaiveNeuron/slovaksum) | ['slk'] | Retrieval | s2s | [News, Social, Web, Written] | None | None | -| [SouthAfricanLangClassification](https://www.kaggle.com/competitions/south-african-language-identification/) (ExploreAI Academy et al., 2022) | ['afr', 'eng', 'nbl', 'nso', 'sot', 'ssw', 'tsn', 'tso', 'ven', 'xho', 'zul'] | Classification | s2s | [Web, Non-fiction, Written] | None | None | +| [SouthAfricanLangClassification](https://www.kaggle.com/competitions/south-african-language-identification/) (ExploreAI Academy et al., 2022) | ['afr', 'eng', 'nbl', 'nso', 'sot', 'ssw', 'tsn', 'tso', 'ven', 'xho', 'zul'] | Classification | s2s | [Non-fiction, Web, Written] | None | None | | [SpanishNewsClassification](https://huggingface.co/datasets/MarcOrfilaCarreras/spanish-news) | ['spa'] | Classification | s2s | [News, Written] | None | None | | [SpanishNewsClusteringP2P](https://www.kaggle.com/datasets/kevinmorgado/spanish-news-classification) | ['spa'] | Clustering | p2p | | None | None | | 
[SpanishPassageRetrievalS2P](https://mklab.iti.gr/results/spanish-passage-retrieval-dataset/) | ['spa'] | Retrieval | s2p | | None | None | @@ -573,7 +573,7 @@ The following tables give you an overview of the tasks in MTEB. | [SprintDuplicateQuestions](https://www.aclweb.org/anthology/D18-1131/) | ['eng'] | PairClassification | s2s | [Programming, Written] | None | None | | [StackExchangeClustering.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle, 2021) | ['eng'] | Clustering | s2s | [Web, Written] | None | None | | [StackExchangeClusteringP2P.v2](https://arxiv.org/abs/2104.07081) (Gregor Geigle, 2021) | ['eng'] | Clustering | p2p | [Web, Written] | None | None | -| [StackOverflowDupQuestions](https://www.microsoft.com/en-us/research/uploads/prod/2019/03/nl4se18LinkSO.pdf) (Xueqing Liu, 2018) | ['eng'] | Reranking | s2s | [Written, Blog, Programming] | None | None | +| [StackOverflowDupQuestions](https://www.microsoft.com/en-us/research/uploads/prod/2019/03/nl4se18LinkSO.pdf) (Xueqing Liu, 2018) | ['eng'] | Reranking | s2s | [Blog, Programming, Written] | None | None | | [StackOverflowQA](https://arxiv.org/abs/2407.02883) (Xiangyang Li, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'test': 21925} | {'test': {'number_of_characters': 26584028, 'num_samples': 21925, 'num_queries': 1994, 'num_documents': 19931, 'min_document_length': 61, 'average_document_length': 130.32, 'max_document_length': 22234, 'unique_documents': 19931, 'min_query_length': 5, 'average_query_length': 12029.38, 'max_query_length': 46028, 'unique_queries': 1994, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1994}} | | [StatcanDialogueDatasetRetrieval](https://mcgill-nlp.github.io/statcan-dialogue-dataset/) | ['eng', 'fra'] | Retrieval | s2p | [Government, Web, Written] | None | None | | [SummEvalFrSummarization.v2](https://github.com/Yale-LILY/SummEval) (Fabbri et al., 2020) | ['fra'] | 
Summarization | p2p | [News, Written] | None | None | @@ -609,10 +609,10 @@ The following tables give you an overview of the tasks in MTEB. | [SynPerChatbotToneChatbotClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | | [SynPerChatbotToneUserClassification](https://mcinext.com/) | ['fas'] | Classification | p2p | [Spoken] | None | None | | [SynPerChatbotTopicsRetrieval](https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-topics-retrieval) | ['fas'] | Retrieval | s2p | [Spoken] | None | None | -| [SynPerQAPC](https://mcinext.com/) | ['fas'] | PairClassification | s2p | [Web, News, Religious, Blog] | None | None | +| [SynPerQAPC](https://mcinext.com/) | ['fas'] | PairClassification | s2p | [Blog, News, Religious, Web] | None | None | | [SynPerQARetrieval](https://huggingface.co/datasets/MCINext/synthetic-persian-qa-retrieval/settings) | ['fas'] | Retrieval | s2p | [Web] | None | None | -| [SynPerSTS](https://mcinext.com/) | ['fas'] | STS | s2s | [Web, News, Religious, Blog] | None | None | -| [SynPerTextKeywordsPC](https://mcinext.com/) | ['fas'] | PairClassification | s2p | [Web, News, Religious, Blog] | None | None | +| [SynPerSTS](https://mcinext.com/) | ['fas'] | STS | s2s | [Blog, News, Religious, Web] | None | None | +| [SynPerTextKeywordsPC](https://mcinext.com/) | ['fas'] | PairClassification | s2p | [Blog, News, Religious, Web] | None | None | | [SyntecReranking](https://huggingface.co/datasets/lyon-nlp/mteb-fr-reranking-syntec-s2p) (Mathieu Ciancone, 2024) | ['fra'] | Reranking | s2p | [Legal, Written] | None | None | | [SyntecRetrieval](https://huggingface.co/datasets/lyon-nlp/mteb-fr-retrieval-syntec-s2p) (Mathieu Ciancone, 2024) | ['fra'] | Retrieval | s2p | [Legal, Written] | None | None | | [SyntheticText2SQL](https://huggingface.co/datasets/gretelai/synthetic_text_to_sql) (Meyer et al., 2024) | ['eng', 'sql'] | Retrieval | p2p | [Programming, Written] | {'test': 111702} | {'test': 
{'number_of_characters': 14041553, 'num_samples': 111702, 'num_queries': 5851, 'num_documents': 105851, 'min_document_length': 13, 'average_document_length': 4.58, 'max_document_length': 281, 'unique_documents': 105851, 'min_query_length': 17, 'average_query_length': 2316.95, 'max_query_length': 762, 'unique_queries': 5851, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 5851}} | @@ -620,7 +620,7 @@ The following tables give you an overview of the tasks in MTEB. | [T2Retrieval](https://arxiv.org/abs/2304.03679) (Xiaohui Xie, 2023) | ['cmn'] | Retrieval | s2p | | None | None | | [TERRa](https://arxiv.org/pdf/2010.15925) (Shavrina et al., 2020) | ['rus'] | PairClassification | s2s | [News, Web, Written] | None | None | | [TNews](https://www.cluebenchmarks.com/introduce.html) | ['cmn'] | Classification | s2s | | None | None | -| [TRECCOVID](https://ir.nist.gov/covidSubmit/index.html) (Kirk Roberts, 2021) | ['eng'] | Retrieval | s2p | [Medical, Academic, Written] | None | None | +| [TRECCOVID](https://ir.nist.gov/covidSubmit/index.html) (Kirk Roberts, 2021) | ['eng'] | Retrieval | s2p | [Academic, Medical, Written] | None | None | | [TRECCOVID-Fa](https://huggingface.co/datasets/MCINext/trec-covid-fa) | ['fas'] | Retrieval | s2p | [Medical] | None | None | | [TRECCOVID-PL](https://ir.nist.gov/covidSubmit/index.html) (Konrad Wojtasik, 2024) | ['pol'] | Retrieval | s2p | [Academic, Medical, Non-fiction, Written] | None | None | | [TV2Nordretrieval](https://huggingface.co/datasets/alexandrainst/nordjylland-news-summarization) | ['dan'] | Retrieval | p2p | [News, Non-fiction, Written] | None | None | @@ -650,7 +650,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [ToxicChatClassification](https://aclanthology.org/2023.findings-emnlp.311/) (Zi Lin, 2023) | ['eng'] | Classification | s2s | [Constructed, Written] | None | None | | [ToxicConversationsClassification](https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/overview) (cjadams, 2019) | ['eng'] | Classification | s2s | [Social, Written] | None | None | | [TswanaNewsClassification](https://link.springer.com/chapter/10.1007/978-3-031-49002-6_17) (Vukosi Marivate, 2023) | ['tsn'] | Classification | s2s | [News, Written] | None | None | -| [TurHistQuadRetrieval](https://github.com/okanvk/Turkish-Reading-Comprehension-Question-Answering-Dataset) (Soygazi et al., 2021) | ['tur'] | Retrieval | p2p | [Encyclopaedic, Non-fiction, Academic, Written] | None | None | +| [TurHistQuadRetrieval](https://github.com/okanvk/Turkish-Reading-Comprehension-Question-Answering-Dataset) (Soygazi et al., 2021) | ['tur'] | Retrieval | p2p | [Academic, Encyclopaedic, Non-fiction, Written] | None | None | | [TurkicClassification](https://huggingface.co/datasets/Electrotubbie/classification_Turkic_languages/) | ['bak', 'kaz', 'kir'] | Classification | s2s | [News, Written] | None | None | | [TurkishMovieSentimentClassification](https://www.win.tue.nl/~mpechen/publications/pubs/MT_WISDOM2013.pdf) (Erkin Demirtas, 2013) | ['tur'] | Classification | s2s | [Reviews, Written] | None | None | | [TurkishProductSentimentClassification](https://www.win.tue.nl/~mpechen/publications/pubs/MT_WISDOM2013.pdf) (Erkin Demirtas, 2013) | ['tur'] | Classification | s2s | [Reviews, Written] | None | None | @@ -658,7 +658,7 @@ The following tables give you an overview of the tasks in MTEB. 
| [TweetSarcasmClassification](https://aclanthology.org/2020.osact-1.5/) | ['ara'] | Classification | s2s | [Social, Written] | None | None | | [TweetSentimentClassification](https://aclanthology.org/2022.lrec-1.27) | ['ara', 'deu', 'eng', 'fra', 'hin', 'ita', 'por', 'spa'] | Classification | s2s | [Social, Written] | None | None | | [TweetSentimentExtractionClassification](https://www.kaggle.com/competitions/tweet-sentiment-extraction/overview) (Maggie et al., 2020) | ['eng'] | Classification | s2s | [Social, Written] | None | None | -| [TweetTopicSingleClassification](https://arxiv.org/abs/2209.09824) | ['eng'] | Classification | s2s | [Social, News, Written] | None | None | +| [TweetTopicSingleClassification](https://arxiv.org/abs/2209.09824) | ['eng'] | Classification | s2s | [News, Social, Written] | None | None | | [TwentyNewsgroupsClustering.v2](https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html) (Ken Lang, 1995) | ['eng'] | Clustering | s2s | [News, Written] | {'test': 59545} | {'test': {'num_samples': 59545, 'number_of_characters': 1907719, 'min_text_length': 11, 'average_text_length': 32.04, 'max_text_length': 120, 'min_labels_per_text': 2082, 'average_labels_per_text': 1.0, 'max_labels_per_text': 3236, 'unique_labels': 20, 'labels': {'12': {'count': 3137}, '6': {'count': 3070}, '0': {'count': 2613}, '2': {'count': 3155}, '10': {'count': 3220}, '17': {'count': 2986}, '14': {'count': 3106}, '13': {'count': 3055}, '1': {'count': 3056}, '16': {'count': 2911}, '9': {'count': 2984}, '3': {'count': 3070}, '15': {'count': 3090}, '7': {'count': 3036}, '5': {'count': 3124}, '11': {'count': 3236}, '18': {'count': 2483}, '8': {'count': 3090}, '19': {'count': 2082}, '4': {'count': 3041}}}} | | [TwitterHjerneRetrieval](https://huggingface.co/datasets/sorenmulli/da-hashtag-twitterhjerne) (Holm et al., 2024) | ['dan'] | Retrieval | p2p | [Social, Written] | None | None | | [TwitterSemEval2015](https://alt.qcri.org/semeval2015/task1/) | ['eng'] | 
PairClassification | s2s | [Social, Written] | None | None | @@ -699,17 +699,17 @@ The following tables give you an overview of the tasks in MTEB. | [WikipediaSpecialtiesInChemistryClustering](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Clustering | s2p | [Chemistry] | None | None | | [WikipediaTheoreticalAppliedClassification](https://arxiv.org/abs/2412.00532) (Kasmaee et al., 2024) | ['eng'] | Classification | s2s | [Chemistry] | None | None | | [WinoGrande](https://winogrande.allenai.org/) (Xiao et al., 2024) | ['eng'] | Retrieval | s2s | [Encyclopaedic, Written] | None | None | -| [WisesightSentimentClassification](https://github.com/PyThaiNLP/wisesight-sentiment) | ['tha'] | Classification | s2s | [Social, News, Written] | None | None | +| [WisesightSentimentClassification](https://github.com/PyThaiNLP/wisesight-sentiment) | ['tha'] | Classification | s2s | [News, Social, Written] | None | None | | XMarket (Bonab et al., 2021) | ['deu', 'eng', 'spa'] | Retrieval | s2p | | None | None | -| [XNLI](https://aclanthology.org/D18-1269/) (Conneau et al., 2018) | ['ara', 'bul', 'deu', 'ell', 'eng', 'fra', 'hin', 'rus', 'spa', 'swa', 'tha', 'tur', 'vie', 'zho'] | PairClassification | s2s | [Non-fiction, Fiction, Government, Written] | {'test': 19110, 'validation': 19110} | {'test': {'num_samples': 19110, 'number_of_characters': 2907145, 'min_sentence1_length': 3, 'avg_sentence1_length': 103.24, 'max_sentence1_length': 401, 'unique_sentence1': 15328, 'min_sentence2_length': 2, 'avg_sentence2_length': 48.89, 'max_sentence2_length': 187, 'unique_sentence2': 19104, 'unique_labels': 2, 'labels': {'0': {'count': 9562}, '1': {'count': 9548}}, 'hf_subset_descriptive_stats': {'ar': {'num_samples': 1365, 'number_of_characters': 179591, 'min_sentence1_length': 11, 'avg_sentence1_length': 89.57, 'max_sentence1_length': 242, 'unique_sentence1': 1095, 'min_sentence2_length': 8, 'avg_sentence2_length': 41.99, 'max_sentence2_length': 115, 'unique_sentence2': 
1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'bg': {'num_samples': 1365, 'number_of_characters': 220646, 'min_sentence1_length': 14, 'avg_sentence1_length': 110.02, 'max_sentence1_length': 303, 'unique_sentence1': 1095, 'min_sentence2_length': 8, 'avg_sentence2_length': 51.63, 'max_sentence2_length': 150, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'de': {'num_samples': 1365, 'number_of_characters': 241224, 'min_sentence1_length': 3, 'avg_sentence1_length': 119.93, 'max_sentence1_length': 301, 'unique_sentence1': 1095, 'min_sentence2_length': 9, 'avg_sentence2_length': 56.79, 'max_sentence2_length': 187, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'el': {'num_samples': 1365, 'number_of_characters': 240222, 'min_sentence1_length': 13, 'avg_sentence1_length': 119.05, 'max_sentence1_length': 344, 'unique_sentence1': 1095, 'min_sentence2_length': 13, 'avg_sentence2_length': 56.93, 'max_sentence2_length': 172, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'en': {'num_samples': 1365, 'number_of_characters': 212223, 'min_sentence1_length': 19, 'avg_sentence1_length': 105.67, 'max_sentence1_length': 268, 'unique_sentence1': 1095, 'min_sentence2_length': 9, 'avg_sentence2_length': 49.8, 'max_sentence2_length': 137, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'es': {'num_samples': 1365, 'number_of_characters': 232207, 'min_sentence1_length': 11, 'avg_sentence1_length': 115.43, 'max_sentence1_length': 385, 'unique_sentence1': 1094, 'min_sentence2_length': 8, 'avg_sentence2_length': 54.68, 'max_sentence2_length': 163, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'fr': {'num_samples': 1365, 'number_of_characters': 245259, 'min_sentence1_length': 9, 
'avg_sentence1_length': 121.1, 'max_sentence1_length': 327, 'unique_sentence1': 1095, 'min_sentence2_length': 10, 'avg_sentence2_length': 58.58, 'max_sentence2_length': 169, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'hi': {'num_samples': 1365, 'number_of_characters': 211312, 'min_sentence1_length': 16, 'avg_sentence1_length': 104.63, 'max_sentence1_length': 401, 'unique_sentence1': 1095, 'min_sentence2_length': 9, 'avg_sentence2_length': 50.17, 'max_sentence2_length': 162, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'ru': {'num_samples': 1365, 'number_of_characters': 222797, 'min_sentence1_length': 11, 'avg_sentence1_length': 110.77, 'max_sentence1_length': 306, 'unique_sentence1': 1095, 'min_sentence2_length': 8, 'avg_sentence2_length': 52.45, 'max_sentence2_length': 167, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'sw': {'num_samples': 1365, 'number_of_characters': 210103, 'min_sentence1_length': 10, 'avg_sentence1_length': 104.44, 'max_sentence1_length': 266, 'unique_sentence1': 1094, 'min_sentence2_length': 2, 'avg_sentence2_length': 49.48, 'max_sentence2_length': 146, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'th': {'num_samples': 1365, 'number_of_characters': 192788, 'min_sentence1_length': 12, 'avg_sentence1_length': 96.69, 'max_sentence1_length': 262, 'unique_sentence1': 1095, 'min_sentence2_length': 6, 'avg_sentence2_length': 44.54, 'max_sentence2_length': 129, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'tr': {'num_samples': 1365, 'number_of_characters': 208658, 'min_sentence1_length': 15, 'avg_sentence1_length': 103.68, 'max_sentence1_length': 255, 'unique_sentence1': 1095, 'min_sentence2_length': 6, 'avg_sentence2_length': 49.19, 'max_sentence2_length': 140, 
'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'vi': {'num_samples': 1365, 'number_of_characters': 223549, 'min_sentence1_length': 14, 'avg_sentence1_length': 111.31, 'max_sentence1_length': 265, 'unique_sentence1': 1095, 'min_sentence2_length': 9, 'avg_sentence2_length': 52.46, 'max_sentence2_length': 143, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'zh': {'num_samples': 1365, 'number_of_characters': 66566, 'min_sentence1_length': 4, 'avg_sentence1_length': 33.04, 'max_sentence1_length': 112, 'unique_sentence1': 1095, 'min_sentence2_length': 3, 'avg_sentence2_length': 15.73, 'max_sentence2_length': 59, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}}}, 'validation': {'num_samples': 19110, 'number_of_characters': 2909058, 'min_sentence1_length': 5, 'avg_sentence1_length': 103.21, 'max_sentence1_length': 323, 'unique_sentence1': 11171, 'min_sentence2_length': 3, 'avg_sentence2_length': 49.02, 'max_sentence2_length': 172, 'unique_sentence2': 19101, 'unique_labels': 2, 'labels': {'0': {'count': 9562}, '1': {'count': 9548}}, 'hf_subset_descriptive_stats': {'ar': {'num_samples': 1365, 'number_of_characters': 177355, 'min_sentence1_length': 13, 'avg_sentence1_length': 88.32, 'max_sentence1_length': 214, 'unique_sentence1': 798, 'min_sentence2_length': 6, 'avg_sentence2_length': 41.61, 'max_sentence2_length': 137, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'bg': {'num_samples': 1365, 'number_of_characters': 219988, 'min_sentence1_length': 16, 'avg_sentence1_length': 109.2, 'max_sentence1_length': 316, 'unique_sentence1': 798, 'min_sentence2_length': 10, 'avg_sentence2_length': 51.97, 'max_sentence2_length': 151, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'de': {'num_samples': 1365, 
'number_of_characters': 241852, 'min_sentence1_length': 20, 'avg_sentence1_length': 119.81, 'max_sentence1_length': 298, 'unique_sentence1': 798, 'min_sentence2_length': 12, 'avg_sentence2_length': 57.37, 'max_sentence2_length': 162, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'el': {'num_samples': 1365, 'number_of_characters': 241275, 'min_sentence1_length': 16, 'avg_sentence1_length': 119.88, 'max_sentence1_length': 302, 'unique_sentence1': 798, 'min_sentence2_length': 6, 'avg_sentence2_length': 56.88, 'max_sentence2_length': 171, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'en': {'num_samples': 1365, 'number_of_characters': 212384, 'min_sentence1_length': 20, 'avg_sentence1_length': 105.72, 'max_sentence1_length': 271, 'unique_sentence1': 798, 'min_sentence2_length': 8, 'avg_sentence2_length': 49.88, 'max_sentence2_length': 139, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'es': {'num_samples': 1365, 'number_of_characters': 232451, 'min_sentence1_length': 14, 'avg_sentence1_length': 115.17, 'max_sentence1_length': 265, 'unique_sentence1': 798, 'min_sentence2_length': 7, 'avg_sentence2_length': 55.12, 'max_sentence2_length': 148, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'fr': {'num_samples': 1365, 'number_of_characters': 246857, 'min_sentence1_length': 19, 'avg_sentence1_length': 121.76, 'max_sentence1_length': 323, 'unique_sentence1': 798, 'min_sentence2_length': 11, 'avg_sentence2_length': 59.09, 'max_sentence2_length': 172, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'hi': {'num_samples': 1365, 'number_of_characters': 212269, 'min_sentence1_length': 18, 'avg_sentence1_length': 105.06, 'max_sentence1_length': 277, 'unique_sentence1': 798, 'min_sentence2_length': 7, 
'avg_sentence2_length': 50.44, 'max_sentence2_length': 152, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'ru': {'num_samples': 1365, 'number_of_characters': 221152, 'min_sentence1_length': 15, 'avg_sentence1_length': 109.75, 'max_sentence1_length': 310, 'unique_sentence1': 798, 'min_sentence2_length': 8, 'avg_sentence2_length': 52.27, 'max_sentence2_length': 140, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'sw': {'num_samples': 1365, 'number_of_characters': 210482, 'min_sentence1_length': 13, 'avg_sentence1_length': 104.32, 'max_sentence1_length': 264, 'unique_sentence1': 798, 'min_sentence2_length': 8, 'avg_sentence2_length': 49.88, 'max_sentence2_length': 153, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'th': {'num_samples': 1365, 'number_of_characters': 192640, 'min_sentence1_length': 7, 'avg_sentence1_length': 97.28, 'max_sentence1_length': 255, 'unique_sentence1': 798, 'min_sentence2_length': 3, 'avg_sentence2_length': 43.84, 'max_sentence2_length': 140, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'tr': {'num_samples': 1365, 'number_of_characters': 208305, 'min_sentence1_length': 15, 'avg_sentence1_length': 102.97, 'max_sentence1_length': 269, 'unique_sentence1': 798, 'min_sentence2_length': 10, 'avg_sentence2_length': 49.64, 'max_sentence2_length': 139, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'vi': {'num_samples': 1365, 'number_of_characters': 224811, 'min_sentence1_length': 18, 'avg_sentence1_length': 112.26, 'max_sentence1_length': 323, 'unique_sentence1': 798, 'min_sentence2_length': 9, 'avg_sentence2_length': 52.43, 'max_sentence2_length': 159, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'zh': {'num_samples': 1365, 
'number_of_characters': 67237, 'min_sentence1_length': 5, 'avg_sentence1_length': 33.41, 'max_sentence1_length': 135, 'unique_sentence1': 798, 'min_sentence2_length': 3, 'avg_sentence2_length': 15.85, 'max_sentence2_length': 66, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}}}} | -| [XNLIV2](https://arxiv.org/pdf/2301.06527) (Upadhyay et al., 2023) | ['asm', 'ben', 'bho', 'ell', 'guj', 'kan', 'mar', 'ory', 'pan', 'rus', 'san', 'tam', 'tur'] | PairClassification | s2s | [Non-fiction, Fiction, Government, Written] | None | None | +| [XNLI](https://aclanthology.org/D18-1269/) (Conneau et al., 2018) | ['ara', 'bul', 'deu', 'ell', 'eng', 'fra', 'hin', 'rus', 'spa', 'swa', 'tha', 'tur', 'vie', 'zho'] | PairClassification | s2s | [Fiction, Government, Non-fiction, Written] | {'test': 19110, 'validation': 19110} | {'test': {'num_samples': 19110, 'number_of_characters': 2907145, 'min_sentence1_length': 3, 'avg_sentence1_length': 103.24, 'max_sentence1_length': 401, 'unique_sentence1': 15328, 'min_sentence2_length': 2, 'avg_sentence2_length': 48.89, 'max_sentence2_length': 187, 'unique_sentence2': 19104, 'unique_labels': 2, 'labels': {'0': {'count': 9562}, '1': {'count': 9548}}, 'hf_subset_descriptive_stats': {'ar': {'num_samples': 1365, 'number_of_characters': 179591, 'min_sentence1_length': 11, 'avg_sentence1_length': 89.57, 'max_sentence1_length': 242, 'unique_sentence1': 1095, 'min_sentence2_length': 8, 'avg_sentence2_length': 41.99, 'max_sentence2_length': 115, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'bg': {'num_samples': 1365, 'number_of_characters': 220646, 'min_sentence1_length': 14, 'avg_sentence1_length': 110.02, 'max_sentence1_length': 303, 'unique_sentence1': 1095, 'min_sentence2_length': 8, 'avg_sentence2_length': 51.63, 'max_sentence2_length': 150, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 
'de': {'num_samples': 1365, 'number_of_characters': 241224, 'min_sentence1_length': 3, 'avg_sentence1_length': 119.93, 'max_sentence1_length': 301, 'unique_sentence1': 1095, 'min_sentence2_length': 9, 'avg_sentence2_length': 56.79, 'max_sentence2_length': 187, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'el': {'num_samples': 1365, 'number_of_characters': 240222, 'min_sentence1_length': 13, 'avg_sentence1_length': 119.05, 'max_sentence1_length': 344, 'unique_sentence1': 1095, 'min_sentence2_length': 13, 'avg_sentence2_length': 56.93, 'max_sentence2_length': 172, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'en': {'num_samples': 1365, 'number_of_characters': 212223, 'min_sentence1_length': 19, 'avg_sentence1_length': 105.67, 'max_sentence1_length': 268, 'unique_sentence1': 1095, 'min_sentence2_length': 9, 'avg_sentence2_length': 49.8, 'max_sentence2_length': 137, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'es': {'num_samples': 1365, 'number_of_characters': 232207, 'min_sentence1_length': 11, 'avg_sentence1_length': 115.43, 'max_sentence1_length': 385, 'unique_sentence1': 1094, 'min_sentence2_length': 8, 'avg_sentence2_length': 54.68, 'max_sentence2_length': 163, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'fr': {'num_samples': 1365, 'number_of_characters': 245259, 'min_sentence1_length': 9, 'avg_sentence1_length': 121.1, 'max_sentence1_length': 327, 'unique_sentence1': 1095, 'min_sentence2_length': 10, 'avg_sentence2_length': 58.58, 'max_sentence2_length': 169, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'hi': {'num_samples': 1365, 'number_of_characters': 211312, 'min_sentence1_length': 16, 'avg_sentence1_length': 104.63, 'max_sentence1_length': 401, 'unique_sentence1': 1095, 
'min_sentence2_length': 9, 'avg_sentence2_length': 50.17, 'max_sentence2_length': 162, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'ru': {'num_samples': 1365, 'number_of_characters': 222797, 'min_sentence1_length': 11, 'avg_sentence1_length': 110.77, 'max_sentence1_length': 306, 'unique_sentence1': 1095, 'min_sentence2_length': 8, 'avg_sentence2_length': 52.45, 'max_sentence2_length': 167, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'sw': {'num_samples': 1365, 'number_of_characters': 210103, 'min_sentence1_length': 10, 'avg_sentence1_length': 104.44, 'max_sentence1_length': 266, 'unique_sentence1': 1094, 'min_sentence2_length': 2, 'avg_sentence2_length': 49.48, 'max_sentence2_length': 146, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'th': {'num_samples': 1365, 'number_of_characters': 192788, 'min_sentence1_length': 12, 'avg_sentence1_length': 96.69, 'max_sentence1_length': 262, 'unique_sentence1': 1095, 'min_sentence2_length': 6, 'avg_sentence2_length': 44.54, 'max_sentence2_length': 129, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'tr': {'num_samples': 1365, 'number_of_characters': 208658, 'min_sentence1_length': 15, 'avg_sentence1_length': 103.68, 'max_sentence1_length': 255, 'unique_sentence1': 1095, 'min_sentence2_length': 6, 'avg_sentence2_length': 49.19, 'max_sentence2_length': 140, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'vi': {'num_samples': 1365, 'number_of_characters': 223549, 'min_sentence1_length': 14, 'avg_sentence1_length': 111.31, 'max_sentence1_length': 265, 'unique_sentence1': 1095, 'min_sentence2_length': 9, 'avg_sentence2_length': 52.46, 'max_sentence2_length': 143, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 
682}}}, 'zh': {'num_samples': 1365, 'number_of_characters': 66566, 'min_sentence1_length': 4, 'avg_sentence1_length': 33.04, 'max_sentence1_length': 112, 'unique_sentence1': 1095, 'min_sentence2_length': 3, 'avg_sentence2_length': 15.73, 'max_sentence2_length': 59, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}}}, 'validation': {'num_samples': 19110, 'number_of_characters': 2909058, 'min_sentence1_length': 5, 'avg_sentence1_length': 103.21, 'max_sentence1_length': 323, 'unique_sentence1': 11171, 'min_sentence2_length': 3, 'avg_sentence2_length': 49.02, 'max_sentence2_length': 172, 'unique_sentence2': 19101, 'unique_labels': 2, 'labels': {'0': {'count': 9562}, '1': {'count': 9548}}, 'hf_subset_descriptive_stats': {'ar': {'num_samples': 1365, 'number_of_characters': 177355, 'min_sentence1_length': 13, 'avg_sentence1_length': 88.32, 'max_sentence1_length': 214, 'unique_sentence1': 798, 'min_sentence2_length': 6, 'avg_sentence2_length': 41.61, 'max_sentence2_length': 137, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'bg': {'num_samples': 1365, 'number_of_characters': 219988, 'min_sentence1_length': 16, 'avg_sentence1_length': 109.2, 'max_sentence1_length': 316, 'unique_sentence1': 798, 'min_sentence2_length': 10, 'avg_sentence2_length': 51.97, 'max_sentence2_length': 151, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'de': {'num_samples': 1365, 'number_of_characters': 241852, 'min_sentence1_length': 20, 'avg_sentence1_length': 119.81, 'max_sentence1_length': 298, 'unique_sentence1': 798, 'min_sentence2_length': 12, 'avg_sentence2_length': 57.37, 'max_sentence2_length': 162, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'el': {'num_samples': 1365, 'number_of_characters': 241275, 'min_sentence1_length': 16, 'avg_sentence1_length': 119.88, 
'max_sentence1_length': 302, 'unique_sentence1': 798, 'min_sentence2_length': 6, 'avg_sentence2_length': 56.88, 'max_sentence2_length': 171, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'en': {'num_samples': 1365, 'number_of_characters': 212384, 'min_sentence1_length': 20, 'avg_sentence1_length': 105.72, 'max_sentence1_length': 271, 'unique_sentence1': 798, 'min_sentence2_length': 8, 'avg_sentence2_length': 49.88, 'max_sentence2_length': 139, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'es': {'num_samples': 1365, 'number_of_characters': 232451, 'min_sentence1_length': 14, 'avg_sentence1_length': 115.17, 'max_sentence1_length': 265, 'unique_sentence1': 798, 'min_sentence2_length': 7, 'avg_sentence2_length': 55.12, 'max_sentence2_length': 148, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'fr': {'num_samples': 1365, 'number_of_characters': 246857, 'min_sentence1_length': 19, 'avg_sentence1_length': 121.76, 'max_sentence1_length': 323, 'unique_sentence1': 798, 'min_sentence2_length': 11, 'avg_sentence2_length': 59.09, 'max_sentence2_length': 172, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'hi': {'num_samples': 1365, 'number_of_characters': 212269, 'min_sentence1_length': 18, 'avg_sentence1_length': 105.06, 'max_sentence1_length': 277, 'unique_sentence1': 798, 'min_sentence2_length': 7, 'avg_sentence2_length': 50.44, 'max_sentence2_length': 152, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'ru': {'num_samples': 1365, 'number_of_characters': 221152, 'min_sentence1_length': 15, 'avg_sentence1_length': 109.75, 'max_sentence1_length': 310, 'unique_sentence1': 798, 'min_sentence2_length': 8, 'avg_sentence2_length': 52.27, 'max_sentence2_length': 140, 'unique_sentence2': 1365, 'unique_labels': 2, 
'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'sw': {'num_samples': 1365, 'number_of_characters': 210482, 'min_sentence1_length': 13, 'avg_sentence1_length': 104.32, 'max_sentence1_length': 264, 'unique_sentence1': 798, 'min_sentence2_length': 8, 'avg_sentence2_length': 49.88, 'max_sentence2_length': 153, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'th': {'num_samples': 1365, 'number_of_characters': 192640, 'min_sentence1_length': 7, 'avg_sentence1_length': 97.28, 'max_sentence1_length': 255, 'unique_sentence1': 798, 'min_sentence2_length': 3, 'avg_sentence2_length': 43.84, 'max_sentence2_length': 140, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'tr': {'num_samples': 1365, 'number_of_characters': 208305, 'min_sentence1_length': 15, 'avg_sentence1_length': 102.97, 'max_sentence1_length': 269, 'unique_sentence1': 798, 'min_sentence2_length': 10, 'avg_sentence2_length': 49.64, 'max_sentence2_length': 139, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'vi': {'num_samples': 1365, 'number_of_characters': 224811, 'min_sentence1_length': 18, 'avg_sentence1_length': 112.26, 'max_sentence1_length': 323, 'unique_sentence1': 798, 'min_sentence2_length': 9, 'avg_sentence2_length': 52.43, 'max_sentence2_length': 159, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}, 'zh': {'num_samples': 1365, 'number_of_characters': 67237, 'min_sentence1_length': 5, 'avg_sentence1_length': 33.41, 'max_sentence1_length': 135, 'unique_sentence1': 798, 'min_sentence2_length': 3, 'avg_sentence2_length': 15.85, 'max_sentence2_length': 66, 'unique_sentence2': 1365, 'unique_labels': 2, 'labels': {'0': {'count': 683}, '1': {'count': 682}}}}}} | +| [XNLIV2](https://arxiv.org/pdf/2301.06527) (Upadhyay et al., 2023) | ['asm', 'ben', 'bho', 'ell', 'guj', 'kan', 'mar', 'ory', 'pan', 
'rus', 'san', 'tam', 'tur'] | PairClassification | s2s | [Fiction, Government, Non-fiction, Written] | None | None | | [XPQARetrieval](https://arxiv.org/abs/2305.09249) (Shen et al., 2023) | ['ara', 'cmn', 'deu', 'eng', 'fra', 'hin', 'ita', 'jpn', 'kor', 'pol', 'por', 'spa', 'tam'] | Retrieval | s2p | [Reviews, Written] | None | None | | [XQuADRetrieval](https://huggingface.co/datasets/xquad) (Mikel Artetxe, 2019) | ['arb', 'deu', 'ell', 'eng', 'hin', 'ron', 'rus', 'spa', 'tha', 'tur', 'vie', 'zho'] | Retrieval | s2p | [Web, Written] | None | None | | [XStance](https://github.com/ZurichNLP/xstance) | ['deu', 'fra', 'ita'] | PairClassification | s2s | [Social, Written] | None | None | | [YahooAnswersTopicsClassification](https://huggingface.co/datasets/yahoo_answers_topics) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Web, Written] | None | None | | [YelpReviewFullClassification](https://arxiv.org/abs/1509.01626) (Zhang et al., 2015) | ['eng'] | Classification | s2s | [Reviews, Written] | None | None | | [YueOpenriceReviewClassification](https://github.com/Christainx/Dataset_Cantonese_Openrice) (Xiang et al., 2019) | ['yue'] | Classification | s2s | [Reviews, Spoken] | None | None | -| [indonli](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) | ['ind'] | PairClassification | s2s | [Encyclopaedic, Web, News, Written] | None | None | +| [indonli](https://link.springer.com/chapter/10.1007/978-3-030-41505-1_39) | ['ind'] | PairClassification | s2s | [Encyclopaedic, News, Web, Written] | None | None | | [mFollowIRCrossLingualInstructionRetrieval](https://neuclir.github.io/) (Weller et al., 2024) | ['eng', 'fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | {'test': 121758} | {'test': {'num_samples': 121758, 'num_docs': 121635, 'num_queries': 123, 'number_of_characters': 283654099, 'min_document_length': 74, 'average_document_length': 2331.08, 'max_document_length': 24179, 'unique_docs': 121635, 'min_query_length': 32, 
'average_query_length': 81.88, 'max_query_length': 173, 'unique_queries': 75, 'min_instruction_length': 93, 'average_instruction_length': 389.95, 'max_instruction_length': 887, 'unique_instructions': 75, 'min_changed_instruction_length': 180, 'average_changed_instruction_length': 450.55, 'max_changed_instruction_length': 974, 'unique_changed_instructions': 123, 'min_average_relevant_docs_per_query': 0, 'average_relevant_docs_per_query': 10.43, 'max_average_relevant_docs_per_query': 24, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000, 'hf_subset_descriptive_stats': {'eng-fas': {'num_samples': 41229, 'num_docs': 41189, 'num_queries': 40, 'number_of_characters': 129597567, 'min_document_length': 99, 'average_document_length': 3145.5, 'max_document_length': 24179, 'unique_docs': 41189, 'min_query_length': 34, 'average_query_length': 80.08, 'max_query_length': 124, 'unique_queries': 40, 'min_instruction_length': 150, 'average_instruction_length': 396.88, 'max_instruction_length': 887, 'unique_instructions': 40, 'min_changed_instruction_length': 205, 'average_changed_instruction_length': 463.18, 'max_changed_instruction_length': 974, 'unique_changed_instructions': 40, 'min_average_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 10.85, 'max_average_relevant_docs_per_query': 22, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000}, 'eng-rus': {'num_samples': 39366, 'num_docs': 39326, 'num_queries': 40, 'number_of_characters': 109522175, 'min_document_length': 75, 'average_document_length': 2784.08, 'max_document_length': 24061, 'unique_docs': 39326, 'min_query_length': 32, 'average_query_length': 81.88, 'max_query_length': 173, 'unique_queries': 40, 'min_instruction_length': 93, 'average_instruction_length': 371.12, 'max_instruction_length': 887, 'unique_instructions': 40, 'min_changed_instruction_length': 180, 
'average_changed_instruction_length': 431.8, 'max_changed_instruction_length': 957, 'unique_changed_instructions': 40, 'min_average_relevant_docs_per_query': 0, 'average_relevant_docs_per_query': 9.78, 'max_average_relevant_docs_per_query': 24, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000}, 'eng-zho': {'num_samples': 41163, 'num_docs': 41120, 'num_queries': 43, 'number_of_characters': 44534357, 'min_document_length': 74, 'average_document_length': 1082.05, 'max_document_length': 23840, 'unique_docs': 41120, 'min_query_length': 32, 'average_query_length': 83.56, 'max_query_length': 159, 'unique_queries': 43, 'min_instruction_length': 157, 'average_instruction_length': 401.02, 'max_instruction_length': 731, 'unique_instructions': 43, 'min_changed_instruction_length': 209, 'average_changed_instruction_length': 456.26, 'max_changed_instruction_length': 822, 'unique_changed_instructions': 43, 'min_average_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 10.65, 'max_average_relevant_docs_per_query': 24, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000}}}} | | [mFollowIRInstructionRetrieval](https://neuclir.github.io/) (Weller et al., 2024) | ['fas', 'rus', 'zho'] | Retrieval | s2p | [News, Written] | {'test': 121758} | {'test': {'num_samples': 121758, 'num_docs': 121635, 'num_queries': 123, 'number_of_characters': 283622456, 'min_document_length': 74, 'average_document_length': 2331.08, 'max_document_length': 24179, 'unique_docs': 121635, 'min_query_length': 10, 'average_query_length': 57.11, 'max_query_length': 136, 'unique_queries': 123, 'min_instruction_length': 37, 'average_instruction_length': 281.07, 'max_instruction_length': 1009, 'unique_instructions': 123, 'min_changed_instruction_length': 44, 'average_changed_instruction_length': 326.94, 'max_changed_instruction_length': 1083, 
'unique_changed_instructions': 123, 'min_average_relevant_docs_per_query': 0, 'average_relevant_docs_per_query': 10.43, 'max_average_relevant_docs_per_query': 24, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000, 'hf_subset_descriptive_stats': {'fas': {'num_samples': 41229, 'num_docs': 41189, 'num_queries': 40, 'number_of_characters': 129593838, 'min_document_length': 99, 'average_document_length': 3145.5, 'max_document_length': 24179, 'unique_docs': 41189, 'min_query_length': 34, 'average_query_length': 72.65, 'max_query_length': 124, 'unique_queries': 40, 'min_instruction_length': 121, 'average_instruction_length': 358.93, 'max_instruction_length': 759, 'unique_instructions': 40, 'min_changed_instruction_length': 163, 'average_changed_instruction_length': 415.32, 'max_changed_instruction_length': 842, 'unique_changed_instructions': 40, 'min_average_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 10.85, 'max_average_relevant_docs_per_query': 22, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000}, 'rus': {'num_samples': 39366, 'num_docs': 39326, 'num_queries': 40, 'number_of_characters': 109523683, 'min_document_length': 75, 'average_document_length': 2784.08, 'max_document_length': 24061, 'unique_docs': 39326, 'min_query_length': 26, 'average_query_length': 77.5, 'max_query_length': 136, 'unique_queries': 40, 'min_instruction_length': 78, 'average_instruction_length': 387.0, 'max_instruction_length': 1009, 'unique_instructions': 40, 'min_changed_instruction_length': 187, 'average_changed_instruction_length': 458.0, 'max_changed_instruction_length': 1083, 'unique_changed_instructions': 40, 'min_average_relevant_docs_per_query': 0, 'average_relevant_docs_per_query': 9.78, 'max_average_relevant_docs_per_query': 24, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 
'max_average_top_ranked_per_query': 1000}, 'zho': {'num_samples': 41163, 'num_docs': 41120, 'num_queries': 43, 'number_of_characters': 44504935, 'min_document_length': 74, 'average_document_length': 1082.05, 'max_document_length': 23840, 'unique_docs': 41120, 'min_query_length': 10, 'average_query_length': 23.7, 'max_query_length': 44, 'unique_queries': 43, 'min_instruction_length': 37, 'average_instruction_length': 110.09, 'max_instruction_length': 209, 'unique_instructions': 43, 'min_changed_instruction_length': 44, 'average_changed_instruction_length': 122.81, 'max_changed_instruction_length': 229, 'unique_changed_instructions': 43, 'min_average_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 10.65, 'max_average_relevant_docs_per_query': 24, 'min_average_top_ranked_per_query': 1000, 'average_top_ranked_per_query': 1000.0, 'max_average_top_ranked_per_query': 1000}}}} | From 23f626da41bf1bd58cfb4e2050d509990cf0b1ed Mon Sep 17 00:00:00 2001 From: github-actions Date: Sat, 1 Feb 2025 15:31:09 +0000 Subject: [PATCH 261/275] 1.31.7 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d30dc99cea..444bbecb21 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.31.6" +version = "1.31.7" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From f3526fc0b83cfb25989ec9ad405995bcad19b35d Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sat, 1 Feb 2025 16:46:08 +0100 Subject: [PATCH 262/275] docs: Updated citation for mteb(scandinavian) (#1914) fix: Updated citation for mteb(scandinavian) --- mteb/benchmarks/benchmarks.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index eddf9d76bb..b6d525fb49 100644 --- 
a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -421,13 +421,12 @@ def load_results( ), description="A curated selection of tasks coverering the Scandinavian languages; Danish, Swedish and Norwegian, including Bokmål and Nynorsk.", reference="https://kennethenevoldsen.github.io/scandinavian-embedding-benchmark/", - citation="""@misc{enevoldsen2024scandinavian, - title={The Scandinavian Embedding Benchmarks: Comprehensive Assessment of Multilingual and Monolingual Text Embedding}, - author={Kenneth Enevoldsen and Márton Kardos and Niklas Muennighoff and Kristoffer Laigaard Nielbo}, - year={2024}, - eprint={2406.02396}, - archivePrefix={arXiv}, - primaryClass={cs.CL} + citation="""@inproceedings{enevoldsen2024scandinavian, + title={The Scandinavian Embedding Benchmarks: Comprehensive Assessment of Multilingual and Monolingual Text Embedding}, + author={Enevoldsen, Kenneth and Kardos, M{\'a}rton and Muennighoff, Niklas and Nielbo, Kristoffer}, + booktitle={Advances in Neural Information Processing Systems}, + year={2024}, + url={https://nips.cc/virtual/2024/poster/97869} }""", contacts=["KennethEnevoldsen", "x-tabdeveloping", "Samoed"], ) From 9c762da0332009375dc4d5a42aa770bd68d309a4 Mon Sep 17 00:00:00 2001 From: Pengfei He Date: Sat, 1 Feb 2025 07:49:22 -0800 Subject: [PATCH 263/275] fix: Add datasets in CodeRAG-Bench (#1595) * add three out of four datasets in CodeRAG-Bench * add verified CodeRAGStackoverflowPostsRetrieval dataset * clean up code and make some comments * fixed lint errors * addressed comments about code-rag datasets: fixed grammar and remove unnecessary code and loop * roll back files which are not supposed to change * fixed the comments in split_by_first_newline() and make the methods private by adding an underscore prefix * refactor to use common args * update task descriptions * add entry in benchmarks * correct the alphanumeric order for the dataset * add in tasks.md * add in tasks.md * update task metadata * update importing 
path * fix lint errors * correct CodeRAG task metadata description field and id for stackoverflow-posts * fix error in test --------- Co-authored-by: Isaac Chung --- docs/tasks.md | 3 + mteb/benchmarks/benchmarks.py | 24 +++ mteb/tasks/Retrieval/__init__.py | 1 + mteb/tasks/Retrieval/code/CodeRAG.py | 272 +++++++++++++++++++++++++++ 4 files changed, 300 insertions(+) create mode 100644 mteb/tasks/Retrieval/code/CodeRAG.py diff --git a/docs/tasks.md b/docs/tasks.md index d6e5cc9bd1..ee88f341a0 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -149,6 +149,9 @@ The following tables give you an overview of the tasks in MTEB. | [CodeEditSearchRetrieval](https://huggingface.co/datasets/cassanof/CodeEditSearch/viewer) (Niklas Muennighoff, 2023) | ['c', 'c++', 'go', 'java', 'javascript', 'php', 'python', 'ruby', 'rust', 'scala', 'shell', 'swift', 'typescript'] | Retrieval | p2p | [Programming, Written] | {'train': 26000} | {'train': {'number_of_characters': 935841, 'num_samples': 26000, 'num_queries': 13000, 'num_documents': 13000, 'min_document_length': 18, 'average_document_length': 70.99, 'max_document_length': 2532, 'unique_documents': 13000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 13000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 13000, 'hf_subset_descriptive_stats': {'python': {'number_of_characters': 70519, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 21, 'average_document_length': 69.52, 'max_document_length': 1811, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'javascript': {'number_of_characters': 57880, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 
'min_document_length': 18, 'average_document_length': 56.88, 'max_document_length': 601, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'typescript': {'number_of_characters': 61092, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 60.09, 'max_document_length': 659, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'go': {'number_of_characters': 71797, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 70.8, 'max_document_length': 1529, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'ruby': {'number_of_characters': 67900, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 20, 'average_document_length': 66.9, 'max_document_length': 751, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'java': {'number_of_characters': 63984, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 23, 'average_document_length': 62.98, 'max_document_length': 807, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 
1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'php': {'number_of_characters': 62927, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 21, 'average_document_length': 61.93, 'max_document_length': 766, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'c': {'number_of_characters': 98588, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 20, 'average_document_length': 97.59, 'max_document_length': 1672, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'c++': {'number_of_characters': 115480, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 22, 'average_document_length': 114.48, 'max_document_length': 1856, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'rust': {'number_of_characters': 68503, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 67.5, 'max_document_length': 2532, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 1000}, 'swift': {'number_of_characters': 58279, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 57.28, 'max_document_length': 727, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'scala': {'number_of_characters': 65833, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 22, 'average_document_length': 64.83, 'max_document_length': 685, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'shell': {'number_of_characters': 73059, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 18, 'average_document_length': 72.06, 'max_document_length': 813, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}}}} | | [CodeFeedbackMT](https://arxiv.org/abs/2402.14658) (Tianyu Zheng, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'test': 79660} | {'test': {'number_of_characters': 156266302, 'num_samples': 79660, 'num_queries': 13277, 'num_documents': 66383, 'min_document_length': 127, 'average_document_length': 885.13, 'max_document_length': 32432, 'unique_documents': 66383, 'min_query_length': 2, 'average_query_length': 7344.18, 'max_query_length': 9403, 'unique_queries': 13277, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 
'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 13277}} | | [CodeFeedbackST](https://arxiv.org/abs/2407.02883) (Xiangyang Li, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'test': 187832} | {'test': {'number_of_characters': 260957682, 'num_samples': 187832, 'num_queries': 31306, 'num_documents': 156526, 'min_document_length': 26, 'average_document_length': 144.85, 'max_document_length': 13851, 'unique_documents': 156526, 'min_query_length': 1, 'average_query_length': 7611.46, 'max_query_length': 11354, 'unique_queries': 31306, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 31306}} | +| ["CodeRAGLibraryDocumentationSolutions"](https://arxiv.org/abs/2406.14497) (Zhiruo Wang, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'train': 61198} | {'train': {'number_of_characters': 2571365, 'num_samples': 61198, 'num_queries': 30599, 'num_documents': 30599, 'min_document_length': 2, 'average_document_length': 82.03428216608386, 'max_document_length': 43706, 'unique_documents': 30599, 'min_query_length': 2, 'average_query_length': 2.0, 'max_query_length': 2, 'unique_queries': 30599, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 30599}} +| ["CodeRAGOnlineTutorials"](https://arxiv.org/abs/2406.14497) (Zhiruo Wang, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] |{'train': 153286} | {'train': {'number_of_characters': 4241139, 'num_samples': 153286, 'num_queries': 76643, 'num_documents': 76643, 'min_document_length': 3, 'average_document_length': 53.33628641885104, 'max_document_length': 221, 'unique_documents': 76643, 'min_query_length': 2, 'average_query_length': 2.0, 'max_query_length': 2, 'unique_queries': 76643, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 76643}} +| 
["CodeRAGProgrammingSolutions"](https://arxiv.org/abs/2406.14497) (Zhiruo Wang, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'train': | 1972} {'train': {'number_of_characters': 80085, 'num_samples': 1972, 'num_queries': 986, 'num_documents': 986, 'min_document_length': 11, 'average_document_length': 79.22210953346855, 'max_document_length': 251, 'unique_documents': 986, 'min_query_length': 2, 'average_query_length': 2.0, 'max_query_length': 2, 'unique_queries': 986, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 986}} | [CodeSearchNetCCRetrieval](https://arxiv.org/abs/2407.02883) (Xiangyang Li, 2024) | ['go', 'java', 'javascript', 'php', 'python', 'ruby'] | Retrieval | p2p | [Programming, Written] | {'test': 1058035} | {'test': {'number_of_characters': 22407915, 'num_samples': 1058035, 'num_queries': 52561, 'num_documents': 1005474, 'min_document_length': 23, 'average_document_length': 20.29, 'max_document_length': 214210, 'unique_documents': 1005474, 'min_query_length': 2, 'average_query_length': 38.26, 'max_query_length': 2, 'unique_queries': 52561, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 52561, 'hf_subset_descriptive_stats': {'python': {'number_of_characters': 8792958, 'num_samples': 295570, 'num_queries': 14918, 'num_documents': 280652, 'min_document_length': 38, 'average_document_length': 29.33, 'max_document_length': 8326, 'unique_documents': 280652, 'min_query_length': 2, 'average_query_length': 37.63, 'max_query_length': 2, 'unique_queries': 14918, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 14918}, 'javascript': {'number_of_characters': 1590642, 'num_samples': 68492, 'num_queries': 3291, 'num_documents': 65201, 'min_document_length': 40, 'average_document_length': 22.4, 
'max_document_length': 214210, 'unique_documents': 65201, 'min_query_length': 2, 'average_query_length': 39.62, 'max_query_length': 2, 'unique_queries': 3291, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3291}, 'go': {'number_of_characters': 2264134, 'num_samples': 190857, 'num_queries': 8122, 'num_documents': 182735, 'min_document_length': 23, 'average_document_length': 10.39, 'max_document_length': 3589, 'unique_documents': 182735, 'min_query_length': 2, 'average_query_length': 45.0, 'max_query_length': 2, 'unique_queries': 8122, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 8122}, 'ruby': {'number_of_characters': 391703, 'num_samples': 28849, 'num_queries': 1261, 'num_documents': 27588, 'min_document_length': 36, 'average_document_length': 12.2, 'max_document_length': 2244, 'unique_documents': 27588, 'min_query_length': 2, 'average_query_length': 43.76, 'max_query_length': 2, 'unique_queries': 1261, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1261}, 'java': {'number_of_characters': 4114584, 'num_samples': 192016, 'num_queries': 10955, 'num_documents': 181061, 'min_document_length': 38, 'average_document_length': 20.72, 'max_document_length': 5066, 'unique_documents': 181061, 'min_query_length': 2, 'average_query_length': 33.06, 'max_query_length': 2, 'unique_queries': 10955, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10955}, 'php': {'number_of_characters': 5253894, 'num_samples': 282251, 'num_queries': 14014, 'num_documents': 268237, 'min_document_length': 40, 'average_document_length': 17.59, 'max_document_length': 2995, 'unique_documents': 268237, 'min_query_length': 2, 'average_query_length': 38.28, 
'max_query_length': 2, 'unique_queries': 14014, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 14014}}}} | | [CodeSearchNetRetrieval](https://huggingface.co/datasets/code_search_net/) (Husain et al., 2019) | ['go', 'java', 'javascript', 'php', 'python', 'ruby'] | Retrieval | p2p | [Programming, Written] | {'test': 12000} | {'test': {'number_of_characters': 1950074, 'num_samples': 12000, 'num_queries': 6000, 'num_documents': 6000, 'min_document_length': 2, 'average_document_length': 324.01, 'max_document_length': 17533, 'unique_documents': 6000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 6000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 6000, 'hf_subset_descriptive_stats': {'python': {'number_of_characters': 467546, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 8, 'average_document_length': 466.55, 'max_document_length': 8636, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'javascript': {'number_of_characters': 187018, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 2, 'average_document_length': 186.02, 'max_document_length': 7657, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'go': {'number_of_characters': 126213, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 14, 'average_document_length': 125.21, 
'max_document_length': 1501, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'ruby': {'number_of_characters': 314818, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 5, 'average_document_length': 313.82, 'max_document_length': 17533, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'java': {'number_of_characters': 691360, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 2, 'average_document_length': 690.36, 'max_document_length': 6473, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'php': {'number_of_characters': 163119, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 5, 'average_document_length': 162.12, 'max_document_length': 1240, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}}}} | | [CodeTransOceanContest](https://arxiv.org/abs/2310.04951) (Weixiang Yan, 2023) | ['c++', 'python'] | Retrieval | p2p | [Programming, Written] | {'test': 1229} | {'test': {'number_of_characters': 1744286, 'num_samples': 1229, 'num_queries': 221, 'num_documents': 1008, 'min_document_length': 8, 'average_document_length': 
221.9, 'max_document_length': 4147, 'unique_documents': 1008, 'min_query_length': 8, 'average_query_length': 6880.58, 'max_query_length': 10852, 'unique_queries': 221, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 221}} | diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index b6d525fb49..8d32fdc1ed 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -1141,6 +1141,30 @@ def load_results( }""", ) + +CODE_RAG = Benchmark( + name="CodeRAG", + tasks=get_tasks( + tasks=[ + "CodeRAGLibraryDocumentationSolutions", + "CodeRAGOnlineTutorials", + "CodeRAGProgrammingSolutions", + "CodeRAGStackoverflowPosts", + ], + ), + description="A benchmark for evaluating code retrieval augmented generation, testing models' ability to retrieve relevant programming solutions, tutorials and documentation.", + reference="https://arxiv.org/abs/2406.14497", + citation="""@misc{wang2024coderagbenchretrievalaugmentcode, + title={CodeRAG-Bench: Can Retrieval Augment Code Generation?}, + author={Zora Zhiruo Wang and Akari Asai and Xinyan Velocity Yu and Frank F. 
Xu and Yiqing Xie and Graham Neubig and Daniel Fried}, + year={2024}, + eprint={2406.14497}, + archivePrefix={arXiv}, + primaryClass={cs.SE}, + url={https://arxiv.org/abs/2406.14497}, + }""", +) + NANOBEIR = Benchmark( name="NanoBEIR", tasks=get_tasks( diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index 291dd983c3..06414da081 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -5,6 +5,7 @@ from .code.CodeEditSearchRetrieval import * from .code.CodeFeedbackMTRetrieval import * from .code.CodeFeedbackSTRetrieval import * +from .code.CodeRAG import * from .code.CodeSearchNetCCRetrieval import * from .code.CodeSearchNetRetrieval import * from .code.CodeTransOceanContestRetrieval import * diff --git a/mteb/tasks/Retrieval/code/CodeRAG.py b/mteb/tasks/Retrieval/code/CodeRAG.py new file mode 100644 index 0000000000..3724f44eca --- /dev/null +++ b/mteb/tasks/Retrieval/code/CodeRAG.py @@ -0,0 +1,272 @@ +from __future__ import annotations + +import datasets + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +def split_by_first_newline(s): + # Split the string by the first newline + parts = s.split("\n", 1) + # Return parts or (s, '') if no newline + return parts if len(parts) > 1 else (s, "") + + +common_args = { + "reference": "https://arxiv.org/pdf/2406.14497", + "type": "Reranking", + "category": "s2s", + "modalities": ["text"], + "eval_splits": ["train"], + "eval_langs": ["python-Code"], + "main_score": "ndcg_at_10", + "date": ("2024-06-02", "2024-06-02"), # best guess + "domains": ["Programming"], + "task_subtypes": ["Code retrieval"], + "license": "cc-by-sa-4.0", + "annotations_creators": "derived", + "dialect": [], + "sample_creation": "found", + "bibtex_citation": """ + @misc{wang2024coderagbenchretrievalaugmentcode, + title={CodeRAG-Bench: Can Retrieval Augment Code Generation?}, + author={Zora Zhiruo Wang and Akari Asai and 
Xinyan Velocity Yu and Frank F. Xu and Yiqing Xie and Graham Neubig and Daniel Fried}, + year={2024}, + eprint={2406.14497}, + archivePrefix={arXiv}, + primaryClass={cs.SE}, + url={https://arxiv.org/abs/2406.14497}, + } + """, +} + + +class CodeRAGProgrammingSolutionsRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CodeRAGProgrammingSolutions", + description="Evaluation of programming solution retrieval using CodeRAG-Bench. Tests the ability to retrieve relevant programming solutions given code-related queries.", + dataset={ + "path": "code-rag-bench/programming-solutions", + "revision": "1064f7bba54d5400d4836f5831fe4c2332a566a6", + }, + **common_args, # type: ignore + ) + + def load_data(self, **kwargs): + """Load dataset from HuggingFace hub""" + if self.data_loaded: + return + self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self) -> None: + """And transform to a retrieval datset, which have the following attributes + + self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document datas like title and text + self.queries = Dict[query_id, str] #id => query + self.relevant_docs = Dict[query_id, Dict[[doc_id, score]] + """ + self.corpus = {} + self.relevant_docs = {} + self.queries = {} + + split = self.metadata.eval_splits[0] + ds: datasets.Dataset = self.dataset[split] # type: ignore + ds = ds.shuffle(seed=42) + + self.queries[split] = {} + self.relevant_docs[split] = {} + self.corpus[split] = {} + + texts = ds["text"] + meta = ds["meta"] + for text, mt in zip(texts, meta): + # in code-rag-bench, + # text = query + "\n" + doc(code) + query, doc = split_by_first_newline(text) + + id = mt["task_id"] + + query_id = id + doc_id = f"doc_{id}" + self.queries[split][query_id] = query + self.corpus[split][doc_id] = {"title": "", "text": doc} + + self.relevant_docs[split][query_id] = { + doc_id: 1 + } # only one correct matches + + +class 
CodeRAGOnlineTutorialsRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CodeRAGOnlineTutorials", + description="Evaluation of online programming tutorial retrieval using CodeRAG-Bench. Tests the ability to retrieve relevant tutorials from online platforms given code-related queries.", + dataset={ + "path": "code-rag-bench/online-tutorials", + "revision": "095bb77130082e4690d6c3a031997b03487bf6e2", + }, + **common_args, # type: ignore + ) + + def load_data(self, **kwargs): + """Load dataset from HuggingFace hub""" + if self.data_loaded: + return + self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self) -> None: + """And transform to a retrieval datset, which have the following attributes + + self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document datas like title and text + self.queries = Dict[query_id, str] #id => query + self.relevant_docs = Dict[query_id, Dict[[doc_id, score]] + """ + self.corpus = {} + self.relevant_docs = {} + self.queries = {} + + split = self.metadata.eval_splits[0] + ds: datasets.Dataset = self.dataset[split] # type: ignore + ds = ds.shuffle(seed=42) + + self.queries[split] = {} + self.relevant_docs[split] = {} + self.corpus[split] = {} + + titles = ds["title"] + texts = ds["text"] + parsed = ds["parsed"] + id = 0 + for title, text, mt in zip(titles, texts, parsed): + # in code-rag-bench, + # query=doc(code) + # text=query+doc(code) + query, doc = title, text + + query_id = str(id) + doc_id = f"doc_{id}" + self.queries[split][query_id] = query + self.corpus[split][doc_id] = {"title": "", "text": doc} + + self.relevant_docs[split][query_id] = { + doc_id: 1 + } # only one correct matches + + id += 1 + + +class CodeRAGLibraryDocumentationSolutionsRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CodeRAGLibraryDocumentationSolutions", + description="Evaluation of code library documentation retrieval 
using CodeRAG-Bench. Tests the ability to retrieve relevant Python library documentation sections given code-related queries.", + dataset={ + "path": "code-rag-bench/library-documentation", + "revision": "b530d3b5a25087d2074e731b76232db85b9e9107", + }, + **common_args, # type: ignore + ) + + def load_data(self, **kwargs): + """Load dataset from HuggingFace hub""" + if self.data_loaded: + return + self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self) -> None: + """And transform to a retrieval datset, which have the following attributes + + self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document datas like title and text + self.queries = Dict[query_id, str] #id => query + self.relevant_docs = Dict[query_id, Dict[[doc_id, score]] + """ + self.corpus = {} + self.relevant_docs = {} + self.queries = {} + + split = self.metadata.eval_splits[0] + ds: datasets.Dataset = self.dataset[split] # type: ignore + ds = ds.shuffle(seed=42) + + self.queries[split] = {} + self.relevant_docs[split] = {} + self.corpus[split] = {} + + texts = ds["doc_content"] + + id = 0 + for text in texts: + # text format "document title \n document content" + query, doc = split_by_first_newline(text) + + # some library documents doesn't have query-doc pair + if not doc: + continue + query_id = str(id) + doc_id = f"doc_{id}" + self.queries[split][query_id] = query + self.corpus[split][doc_id] = {"title": "", "text": doc} + # only one correct match + self.relevant_docs[split][query_id] = {doc_id: 1} + id += 1 + + +class CodeRAGStackoverflowPostsRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CodeRAGStackoverflowPosts", + description="Evaluation of StackOverflow post retrieval using CodeRAG-Bench. 
Tests the ability to retrieve relevant StackOverflow posts given code-related queries.", + dataset={ + "path": "code-rag-bench/stackoverflow-posts", + "revision": "04e05d86cb0ac467b29a5d87f4c56eac99dfc0a4", + }, + **common_args, # type: ignore + ) + + def load_data(self, **kwargs): + """Load dataset from HuggingFace hub""" + if self.data_loaded: + return + self.dataset = datasets.load_dataset(**self.metadata.dataset) # type: ignore + self.dataset_transform() + self.data_loaded = True + + def dataset_transform(self) -> None: + """And transform to a retrieval datset, which have the following attributes + + self.corpus = Dict[doc_id, Dict[str, str]] #id => dict with document datas like title and text + self.queries = Dict[query_id, str] #id => query + self.relevant_docs = Dict[query_id, Dict[[doc_id, score]] + """ + self.corpus = {} + self.relevant_docs = {} + self.queries = {} + + split = self.metadata.eval_splits[0] + ds: datasets.Dataset = self.dataset[split] # type: ignore + ds = ds.shuffle(seed=42) + + self.queries[split] = {} + self.relevant_docs[split] = {} + self.corpus[split] = {} + + texts = ds["text"] + id = 0 + for text in texts: + # in code-rag-bench, + # text = query + "\n" + doc + query, doc = split_by_first_newline(text) + + query_id = str(id) + doc_id = f"doc_{id}" + self.queries[split][query_id] = query + self.corpus[split][doc_id] = {"title": "", "text": doc} + + self.relevant_docs[split][query_id] = { + doc_id: 1 + } # only one correct matches + id += 1 From 57db0f9492928c4653d37b1699cb86223c894517 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sat, 1 Feb 2025 15:51:26 +0000 Subject: [PATCH 264/275] Update tasks table --- docs/tasks.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/tasks.md b/docs/tasks.md index ee88f341a0..20be22bee2 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -149,9 +149,10 @@ The following tables give you an overview of the tasks in MTEB. 
| [CodeEditSearchRetrieval](https://huggingface.co/datasets/cassanof/CodeEditSearch/viewer) (Niklas Muennighoff, 2023) | ['c', 'c++', 'go', 'java', 'javascript', 'php', 'python', 'ruby', 'rust', 'scala', 'shell', 'swift', 'typescript'] | Retrieval | p2p | [Programming, Written] | {'train': 26000} | {'train': {'number_of_characters': 935841, 'num_samples': 26000, 'num_queries': 13000, 'num_documents': 13000, 'min_document_length': 18, 'average_document_length': 70.99, 'max_document_length': 2532, 'unique_documents': 13000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 13000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 13000, 'hf_subset_descriptive_stats': {'python': {'number_of_characters': 70519, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 21, 'average_document_length': 69.52, 'max_document_length': 1811, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'javascript': {'number_of_characters': 57880, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 18, 'average_document_length': 56.88, 'max_document_length': 601, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'typescript': {'number_of_characters': 61092, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 60.09, 'max_document_length': 659, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 
1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'go': {'number_of_characters': 71797, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 70.8, 'max_document_length': 1529, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'ruby': {'number_of_characters': 67900, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 20, 'average_document_length': 66.9, 'max_document_length': 751, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'java': {'number_of_characters': 63984, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 23, 'average_document_length': 62.98, 'max_document_length': 807, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'php': {'number_of_characters': 62927, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 21, 'average_document_length': 61.93, 'max_document_length': 766, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 
'unique_relevant_docs': 1000}, 'c': {'number_of_characters': 98588, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 20, 'average_document_length': 97.59, 'max_document_length': 1672, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'c++': {'number_of_characters': 115480, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 22, 'average_document_length': 114.48, 'max_document_length': 1856, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'rust': {'number_of_characters': 68503, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 67.5, 'max_document_length': 2532, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'swift': {'number_of_characters': 58279, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 19, 'average_document_length': 57.28, 'max_document_length': 727, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'scala': {'number_of_characters': 65833, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 22, 
'average_document_length': 64.83, 'max_document_length': 685, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'shell': {'number_of_characters': 73059, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 18, 'average_document_length': 72.06, 'max_document_length': 813, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}}}} | | [CodeFeedbackMT](https://arxiv.org/abs/2402.14658) (Tianyu Zheng, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'test': 79660} | {'test': {'number_of_characters': 156266302, 'num_samples': 79660, 'num_queries': 13277, 'num_documents': 66383, 'min_document_length': 127, 'average_document_length': 885.13, 'max_document_length': 32432, 'unique_documents': 66383, 'min_query_length': 2, 'average_query_length': 7344.18, 'max_query_length': 9403, 'unique_queries': 13277, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 13277}} | | [CodeFeedbackST](https://arxiv.org/abs/2407.02883) (Xiangyang Li, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'test': 187832} | {'test': {'number_of_characters': 260957682, 'num_samples': 187832, 'num_queries': 31306, 'num_documents': 156526, 'min_document_length': 26, 'average_document_length': 144.85, 'max_document_length': 13851, 'unique_documents': 156526, 'min_query_length': 1, 'average_query_length': 7611.46, 'max_query_length': 11354, 'unique_queries': 31306, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 
'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 31306}} | -| ["CodeRAGLibraryDocumentationSolutions"](https://arxiv.org/abs/2406.14497) (Zhiruo Wang, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'train': 61198} | {'train': {'number_of_characters': 2571365, 'num_samples': 61198, 'num_queries': 30599, 'num_documents': 30599, 'min_document_length': 2, 'average_document_length': 82.03428216608386, 'max_document_length': 43706, 'unique_documents': 30599, 'min_query_length': 2, 'average_query_length': 2.0, 'max_query_length': 2, 'unique_queries': 30599, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 30599}} -| ["CodeRAGOnlineTutorials"](https://arxiv.org/abs/2406.14497) (Zhiruo Wang, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] |{'train': 153286} | {'train': {'number_of_characters': 4241139, 'num_samples': 153286, 'num_queries': 76643, 'num_documents': 76643, 'min_document_length': 3, 'average_document_length': 53.33628641885104, 'max_document_length': 221, 'unique_documents': 76643, 'min_query_length': 2, 'average_query_length': 2.0, 'max_query_length': 2, 'unique_queries': 76643, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 76643}} -| ["CodeRAGProgrammingSolutions"](https://arxiv.org/abs/2406.14497) (Zhiruo Wang, 2024) | ['eng'] | Retrieval | p2p | [Programming, Written] | {'train': | 1972} {'train': {'number_of_characters': 80085, 'num_samples': 1972, 'num_queries': 986, 'num_documents': 986, 'min_document_length': 11, 'average_document_length': 79.22210953346855, 'max_document_length': 251, 'unique_documents': 986, 'min_query_length': 2, 'average_query_length': 2.0, 'max_query_length': 2, 'unique_queries': 986, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 986}} +| 
[CodeRAGLibraryDocumentationSolutions](https://arxiv.org/pdf/2406.14497) (Zora Zhiruo Wang, 2024) | ['python'] | Reranking | s2s | [Programming] | None | None | +| [CodeRAGOnlineTutorials](https://arxiv.org/pdf/2406.14497) (Zora Zhiruo Wang, 2024) | ['python'] | Reranking | s2s | [Programming] | None | None | +| [CodeRAGProgrammingSolutions](https://arxiv.org/pdf/2406.14497) (Zora Zhiruo Wang, 2024) | ['python'] | Reranking | s2s | [Programming] | None | None | +| [CodeRAGStackoverflowPosts](https://arxiv.org/pdf/2406.14497) (Zora Zhiruo Wang, 2024) | ['python'] | Reranking | s2s | [Programming] | None | None | | [CodeSearchNetCCRetrieval](https://arxiv.org/abs/2407.02883) (Xiangyang Li, 2024) | ['go', 'java', 'javascript', 'php', 'python', 'ruby'] | Retrieval | p2p | [Programming, Written] | {'test': 1058035} | {'test': {'number_of_characters': 22407915, 'num_samples': 1058035, 'num_queries': 52561, 'num_documents': 1005474, 'min_document_length': 23, 'average_document_length': 20.29, 'max_document_length': 214210, 'unique_documents': 1005474, 'min_query_length': 2, 'average_query_length': 38.26, 'max_query_length': 2, 'unique_queries': 52561, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 52561, 'hf_subset_descriptive_stats': {'python': {'number_of_characters': 8792958, 'num_samples': 295570, 'num_queries': 14918, 'num_documents': 280652, 'min_document_length': 38, 'average_document_length': 29.33, 'max_document_length': 8326, 'unique_documents': 280652, 'min_query_length': 2, 'average_query_length': 37.63, 'max_query_length': 2, 'unique_queries': 14918, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 14918}, 'javascript': {'number_of_characters': 1590642, 'num_samples': 68492, 'num_queries': 3291, 'num_documents': 65201, 'min_document_length': 40, 'average_document_length': 22.4, 
'max_document_length': 214210, 'unique_documents': 65201, 'min_query_length': 2, 'average_query_length': 39.62, 'max_query_length': 2, 'unique_queries': 3291, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 3291}, 'go': {'number_of_characters': 2264134, 'num_samples': 190857, 'num_queries': 8122, 'num_documents': 182735, 'min_document_length': 23, 'average_document_length': 10.39, 'max_document_length': 3589, 'unique_documents': 182735, 'min_query_length': 2, 'average_query_length': 45.0, 'max_query_length': 2, 'unique_queries': 8122, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 8122}, 'ruby': {'number_of_characters': 391703, 'num_samples': 28849, 'num_queries': 1261, 'num_documents': 27588, 'min_document_length': 36, 'average_document_length': 12.2, 'max_document_length': 2244, 'unique_documents': 27588, 'min_query_length': 2, 'average_query_length': 43.76, 'max_query_length': 2, 'unique_queries': 1261, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1261}, 'java': {'number_of_characters': 4114584, 'num_samples': 192016, 'num_queries': 10955, 'num_documents': 181061, 'min_document_length': 38, 'average_document_length': 20.72, 'max_document_length': 5066, 'unique_documents': 181061, 'min_query_length': 2, 'average_query_length': 33.06, 'max_query_length': 2, 'unique_queries': 10955, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 10955}, 'php': {'number_of_characters': 5253894, 'num_samples': 282251, 'num_queries': 14014, 'num_documents': 268237, 'min_document_length': 40, 'average_document_length': 17.59, 'max_document_length': 2995, 'unique_documents': 268237, 'min_query_length': 2, 'average_query_length': 38.28, 
'max_query_length': 2, 'unique_queries': 14014, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 14014}}}} | | [CodeSearchNetRetrieval](https://huggingface.co/datasets/code_search_net/) (Husain et al., 2019) | ['go', 'java', 'javascript', 'php', 'python', 'ruby'] | Retrieval | p2p | [Programming, Written] | {'test': 12000} | {'test': {'number_of_characters': 1950074, 'num_samples': 12000, 'num_queries': 6000, 'num_documents': 6000, 'min_document_length': 2, 'average_document_length': 324.01, 'max_document_length': 17533, 'unique_documents': 6000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 6000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 6000, 'hf_subset_descriptive_stats': {'python': {'number_of_characters': 467546, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 8, 'average_document_length': 466.55, 'max_document_length': 8636, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'javascript': {'number_of_characters': 187018, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 2, 'average_document_length': 186.02, 'max_document_length': 7657, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'go': {'number_of_characters': 126213, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 14, 'average_document_length': 125.21, 
'max_document_length': 1501, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'ruby': {'number_of_characters': 314818, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 5, 'average_document_length': 313.82, 'max_document_length': 17533, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'java': {'number_of_characters': 691360, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 2, 'average_document_length': 690.36, 'max_document_length': 6473, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}, 'php': {'number_of_characters': 163119, 'num_samples': 2000, 'num_queries': 1000, 'num_documents': 1000, 'min_document_length': 5, 'average_document_length': 162.12, 'max_document_length': 1240, 'unique_documents': 1000, 'min_query_length': 1, 'average_query_length': 1.0, 'max_query_length': 1, 'unique_queries': 1000, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 1000}}}} | | [CodeTransOceanContest](https://arxiv.org/abs/2310.04951) (Weixiang Yan, 2023) | ['c++', 'python'] | Retrieval | p2p | [Programming, Written] | {'test': 1229} | {'test': {'number_of_characters': 1744286, 'num_samples': 1229, 'num_queries': 221, 'num_documents': 1008, 'min_document_length': 8, 'average_document_length': 
221.9, 'max_document_length': 4147, 'unique_documents': 1008, 'min_query_length': 8, 'average_query_length': 6880.58, 'max_query_length': 10852, 'unique_queries': 221, 'min_relevant_docs_per_query': 1, 'average_relevant_docs_per_query': 1.0, 'max_relevant_docs_per_query': 1, 'unique_relevant_docs': 221}} | @@ -944,7 +945,7 @@ The following tables give you an overview of the tasks in MTEB. | cni | Asháninka | Arawakan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | cnl | Lalana Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | cnt | Tepetotutla Chinantec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| code | unknown | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 37 | 0 | 0 | 0 | 37 | +| code | unknown | Unclassified | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 37 | 0 | 0 | 0 | 41 | | cof | Colorado | Barbacoan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | con | Cofán | Unclassified | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | cop | Coptic | Afro-Asiatic | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1779,7 +1780,7 @@ The following tables give you an overview of the tasks in MTEB. 
| zty | Yatee Zapotec | Otomanguean | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zul | Zulu | Atlantic-Congo | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | | zyp | Zyphe Chin | Sino-Tibetan | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| Total | None | None | None | 1398 | 836 | 311 | 3 | 28 | 91 | 51 | 507 | 88 | 2 | 2 | +| Total | None | None | None | 1398 | 836 | 311 | 3 | 28 | 91 | 55 | 507 | 88 | 2 | 2 | From dba7a952e39a8d4a53aab967d761354ae030ca54 Mon Sep 17 00:00:00 2001 From: github-actions Date: Sat, 1 Feb 2025 16:03:15 +0000 Subject: [PATCH 265/275] 1.31.8 Automatically generated by python-semantic-release --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 444bbecb21..1ccdad72db 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.31.7" +version = "1.31.8" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ From 9b5226882f9f5d2f15d1d1043473bd7a7a669c5e Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 2 Feb 2025 16:26:45 +0300 Subject: [PATCH 266/275] update __init__ --- mteb/tasks/BitextMining/__init__.py | 74 +- mteb/tasks/BitextMining/eng/__init__.py | 5 + mteb/tasks/Classification/__init__.py | 781 +++++++-- mteb/tasks/Classification/eng/__init__.py | 58 + mteb/tasks/Classification/fas/__init__.py | 52 +- mteb/tasks/Clustering/__init__.py | 239 ++- mteb/tasks/Clustering/eng/__init__.py | 6 + mteb/tasks/Clustering/fas/__init__.py | 17 + mteb/tasks/PairClassification/__init__.py | 109 +- mteb/tasks/PairClassification/eng/__init__.py | 8 + mteb/tasks/PairClassification/fas/__init__.py | 20 +- .../multilingual/__init__.py | 2 + mteb/tasks/Retrieval/__init__.py | 609 +++++-- mteb/tasks/Retrieval/code/__init__.py | 10 + mteb/tasks/Retrieval/eng/__init__.py | 4 + mteb/tasks/Retrieval/fas/__init__.py | 69 
+ mteb/tasks/STS/__init__.py | 104 +- mteb/tasks/STS/fas/__init__.py | 5 + mteb/tasks/SummaryRetrieval/__init__.py | 4 +- mteb/tasks/SummaryRetrieval/fas/__init__.py | 9 + mteb/tasks/__init__.py | 1549 ++++++++++++++++- mteb/tasks/aggregated_tasks/__init__.py | 12 +- tests/test_tasks/test_all_abstasks.py | 5 +- 23 files changed, 3281 insertions(+), 470 deletions(-) diff --git a/mteb/tasks/BitextMining/__init__.py b/mteb/tasks/BitextMining/__init__.py index 1cec5d5ddc..790505709f 100644 --- a/mteb/tasks/BitextMining/__init__.py +++ b/mteb/tasks/BitextMining/__init__.py @@ -1,25 +1,53 @@ from __future__ import annotations -from .dan.BornholmskBitextMining import * -from .eng.PubChemSMILESBitextMining import * -from .kat.TbilisiCityHallBitextMining import * -from .multilingual.BibleNLPBitextMining import * -from .multilingual.BUCCBitextMining import * -from .multilingual.BUCCBitextMiningFast import * -from .multilingual.DiaBLaBitextMining import * -from .multilingual.FloresBitextMining import * -from .multilingual.IN22ConvBitextMining import * -from .multilingual.IN22GenBitextMining import * -from .multilingual.IndicGenBenchFloresBitextMining import * -from .multilingual.IWSLT2017BitextMining import * -from .multilingual.LinceMTBitextMining import * -from .multilingual.NollySentiBitextMining import * -from .multilingual.NorwegianCourtsBitextMining import * -from .multilingual.NTREXBitextMining import * -from .multilingual.NusaTranslationBitextMining import * -from .multilingual.NusaXBitextMining import * -from .multilingual.PhincBitextMining import * -from .multilingual.RomaTalesBitextMining import * -from .multilingual.TatoebaBitextMining import * -from .srn.SRNCorpusBitextMining import * -from .vie.VieMedEVBitextMining import * +from .dan import BornholmBitextMining +from .eng import PubChemSMILESBitextMining +from .kat import TbilisiCityHallBitextMining +from .multilingual import ( + BibleNLPBitextMining, + BUCCBitextMining, + BUCCBitextMiningFast, + 
DiaBLaBitextMining, + FloresBitextMining, + IN22ConvBitextMining, + IN22GenBitextMining, + IndicGenBenchFloresBitextMining, + IWSLT2017BitextMining, + LinceMTBitextMining, + NollySentiBitextMining, + NorwegianCourtsBitextMining, + NTREXBitextMining, + NusaTranslationBitextMining, + NusaXBitextMining, + PhincBitextMining, + RomaTalesBitextMining, + TatoebaBitextMining, +) +from .srn import SRNCorpusBitextMining +from .vie import VieMedEVBitextMining + +__all__ = [ + "TbilisiCityHallBitextMining", + "VieMedEVBitextMining", + "BornholmBitextMining", + "SRNCorpusBitextMining", + "PubChemSMILESBitextMining", + "IN22ConvBitextMining", + "IN22GenBitextMining", + "BUCCBitextMining", + "LinceMTBitextMining", + "NusaTranslationBitextMining", + "DiaBLaBitextMining", + "NTREXBitextMining", + "IndicGenBenchFloresBitextMining", + "NollySentiBitextMining", + "BUCCBitextMiningFast", + "PhincBitextMining", + "TatoebaBitextMining", + "NusaXBitextMining", + "IWSLT2017BitextMining", + "BibleNLPBitextMining", + "FloresBitextMining", + "RomaTalesBitextMining", + "NorwegianCourtsBitextMining", +] diff --git a/mteb/tasks/BitextMining/eng/__init__.py b/mteb/tasks/BitextMining/eng/__init__.py index e69de29bb2..dc070b379f 100644 --- a/mteb/tasks/BitextMining/eng/__init__.py +++ b/mteb/tasks/BitextMining/eng/__init__.py @@ -0,0 +1,5 @@ +from __future__ import annotations + +from .PubChemSMILESBitextMining import PubChemSMILESBitextMining + +__all__ = ["PubChemSMILESBitextMining"] diff --git a/mteb/tasks/Classification/__init__.py b/mteb/tasks/Classification/__init__.py index b2aab22714..40fae96f72 100644 --- a/mteb/tasks/Classification/__init__.py +++ b/mteb/tasks/Classification/__init__.py @@ -1,162 +1,625 @@ from __future__ import annotations -from .ara.AJGT import * -from .ara.HotelReviewSentimentClassification import * -from .ara.OnlineStoreReviewSentimentClassification import * -from .ara.RestaurantReviewSentimentClassification import * -from .ara.TweetEmotionClassification import * 
-from .ara.TweetSarcasmClassification import * -from .ben.BengaliDocumentClassification import * -from .ben.BengaliHateSpeechClassification import * -from .ben.BengaliSentimentAnalysis import * -from .bul.BulgarianStoreReviewSentimentClassfication import * -from .ces.CSFDCZMovieReviewSentimentClassification import * -from .ces.CzechProductReviewSentimentClassification import * -from .ces.CzechSoMeSentimentClassification import * -from .ces.CzechSubjectivityClassification import * -from .dan.AngryTweetsClassification import * -from .dan.DanishPoliticalCommentsClassification import * -from .dan.DKHateClassification import * -from .dan.LccSentimentClassification import * -from .deu.GermanPoliticiansTwitterSentimentClassification import * -from .deu.TenKGnadClassification import * -from .ell.GreekLegalCodeClassification import * -from .eng.AmazonPolarityClassification import * -from .eng.ArxivClassification import * -from .eng.Banking77Classification import * -from .eng.DBpediaClassification import * -from .eng.EmotionClassification import * -from .eng.FinancialPhrasebankClassification import * -from .eng.FrenkEnClassification import * -from .eng.ImdbClassification import * -from .eng.LegalBenchClassification import * -from .eng.NewsClassification import * -from .eng.PatentClassification import * -from .eng.PoemSentimentClassification import * -from .eng.SDSEyeProtectionClassification import * -from .eng.SDSGlovesClassification import * -from .eng.ToxicChatClassification import * -from .eng.ToxicConversationsClassification import * -from .eng.TweetSentimentExtractionClassification import * -from .eng.TweetTopicSingleClassification import * -from .eng.WikipediaBiolumNeurochemClassification import * -from .eng.WikipediaBioMetChemClassification import * -from .eng.WikipediaChemEngSpecialtiesClassification import * -from .eng.WikipediaChemFieldsClassification import * -from .eng.WikipediaChemistryTopicsClassification import * -from 
.eng.WikipediaCompChemSpectroscopyClassification import * -from .eng.WikipediaCryobiologySeparationClassification import * -from .eng.WikipediaCrystallographyAnalyticalClassification import * -from .eng.WikipediaGreenhouseEnantiopureClassification import * -from .eng.WikipediaIsotopesFissionClassification import * -from .eng.WikipediaLuminescenceClassification import * -from .eng.WikipediaOrganicInorganicClassification import * -from .eng.WikipediaSaltsSemiconductorsClassification import * -from .eng.WikipediaSolidStateColloidalClassification import * -from .eng.WikipediaTheoreticalAppliedClassification import * -from .eng.YahooAnswersTopicsClassification import * -from .eng.YelpReviewFullClassification import * -from .est.estonian_valence import * -from .fas.FaMTEBClassification import * -from .fas.PersianFoodSentimentClassification import * -from .fil.FilipinoHateSpeechClassification import * -from .fil.FilipinoShopeeReviewsClassification import * -from .fin.FinToxicityClassification import * -from .fra.FrenchBookReviews import * -from .fra.MovieReviewSentimentClassification import * -from .guj.GujaratiNewsClassification import * -from .heb.HebrewSentimentAnalysis import * -from .hin.HindiDiscourseClassification import * -from .hin.SentimentAnalysisHindi import * -from .hrv.FrenkHrClassification import * -from .ind.IndonesianIdClickbaitClassification import * -from .ind.IndonesianMongabayConservationClassification import * -from .ita.ItaCaseholdClassification import * -from .ita.ItalianLinguistAcceptabilityClassification import * -from .jav.JavaneseIMDBClassification import * -from .jpn.WRIMEClassification import * -from .kan.KannadaNewsClassification import * -from .kor.KlueTC import * -from .kor.KorFin import * -from .kor.KorHateClassification import * -from .kor.KorSarcasmClassification import * -from .kur.KurdishSentimentClassification import * -from .mal.MalayalamNewsClassification import * -from .mar.MarathiNewsClassification import * -from 
.mkd.MacedonianTweetSentimentClassification import * -from .multilingual.AfriSentiClassification import * -from .multilingual.AfriSentiLangClassification import * -from .multilingual.AmazonCounterfactualClassification import * -from .multilingual.AmazonReviewsClassification import * -from .multilingual.CataloniaTweetClassification import * -from .multilingual.CyrillicTurkicLangClassification import * -from .multilingual.HinDialectClassification import * -from .multilingual.IndicLangClassification import * -from .multilingual.IndicNLPNewsClassification import * -from .multilingual.IndicSentimentClassification import * -from .multilingual.LanguageClassification import * -from .multilingual.MasakhaNEWSClassification import * -from .multilingual.MassiveIntentClassification import * -from .multilingual.MassiveScenarioClassification import * -from .multilingual.MTOPDomainClassification import * -from .multilingual.MTOPIntentClassification import * -from .multilingual.MultiHateClassification import * -from .multilingual.MultilingualSentimentClassification import * -from .multilingual.NaijaSenti import * -from .multilingual.NordicLangClassification import * -from .multilingual.NusaParagraphEmotionClassification import * -from .multilingual.NusaParagraphTopicClassification import * -from .multilingual.NusaXSenti import * -from .multilingual.ScalaClassification import * -from .multilingual.SIB200Classification import * -from .multilingual.SouthAfricanLangClassification import * -from .multilingual.SwissJudgementClassification import * -from .multilingual.TurkicClassification import * -from .multilingual.TweetSentimentClassification import * -from .mya.MyanmarNews import * -from .nep.NepaliNewsClassification import * -from .nld.DutchBookReviewSentimentClassification import * -from .nob.NoRecClassification import * -from .nob.NorwegianParliamentClassification import * -from .ory.OdiaNewsClassification import * -from .pan.PunjabiNewsClassification import * -from 
.pol.PolishClassification import * -from .por.HateSpeechPortugueseClassification import * -from .ron.Moroco import * -from .ron.RomanianReviewsSentiment import * -from .ron.RomanianSentimentClassification import * -from .rus.GeoreviewClassification import * -from .rus.HeadlineClassification import * -from .rus.InappropriatenessClassification import * -from .rus.KinopoiskClassification import * -from .rus.RuReviewsClassification import * -from .rus.RuSciBenchGRNTIClassification import * -from .rus.RuSciBenchOECDClassification import * -from .san.SanskritShlokasClassification import * -from .sin.SinhalaNewsClassification import * -from .sin.SinhalaNewsSourceClassification import * -from .slk.CSFDSKMovieReviewSentimentClassification import * -from .slk.SlovakHateSpeechClassification import * -from .slv.FrenkSlClassification import * -from .spa.SpanishNewsClassification import * -from .spa.SpanishSentimentClassification import * -from .ssw.SiswatiNewsClassification import * -from .svk.SlovakMovieReviewSentimentClassification import * -from .swa.SwahiliNewsClassification import * -from .swe.DalajClassification import * -from .swe.SwedishSentimentClassification import * -from .swe.SweRecClassification import * -from .tam.TamilNewsClassification import * -from .tel.TeluguAndhraJyotiNewsClassification import * -from .tha.WisesightSentimentClassification import * -from .tsn.TswanaNewsClassification import * -from .tur.TurkishMovieSentimentClassification import * -from .tur.TurkishProductSentimentClassification import * -from .ukr.UkrFormalityClassification import * -from .urd.UrduRomanSentimentClassification import * -from .vie.VieStudentFeedbackClassification import * -from .zho.CMTEBClassification import * -from .zho.YueOpenriceReviewClassification import ( - YueOpenriceReviewClassification, # noqa: F401 +from .ara import ( + AJGT, + HotelReviewSentimentClassification, + OnlineStoreReviewSentimentClassification, + RestaurantReviewSentimentClassification, + 
TweetEmotionClassification, + TweetSarcasmClassification, ) -from .zul.IsiZuluNewsClassification import * +from .ben import ( + BengaliDocumentClassification, + BengaliHateSpeechClassification, + BengaliSentimentAnalysis, +) +from .bul import BulgarianStoreReviewSentimentClassfication +from .ces import ( + CSFDCZMovieReviewSentimentClassification, + CzechProductReviewSentimentClassification, + CzechSoMeSentimentClassification, + CzechSubjectivityClassification, +) +from .dan import ( + AngryTweetsClassification, + DanishPoliticalCommentsClassification, + DdiscoCohesionClassification, + DKHateClassification, + LccSentimentClassification, +) +from .deu import GermanPoliticiansTwitterSentimentClassification, TenKGnadClassification +from .ell import GreekLegalCodeClassification +from .eng import ( + AmazonPolarityClassification, + ArxivClassification, + Banking77Classification, + CanadaTaxCourtOutcomesLegalBenchClassification, + ContractNLIConfidentialityOfAgreementLegalBenchClassification, + ContractNLIExplicitIdentificationLegalBenchClassification, + ContractNLIInclusionOfVerballyConveyedInformationLegalBenchClassification, + ContractNLILimitedUseLegalBenchClassification, + ContractNLINoLicensingLegalBenchClassification, + ContractNLINoticeOnCompelledDisclosureLegalBenchClassification, + ContractNLIPermissibleAcquirementOfSimilarInformationLegalBenchClassification, + ContractNLIPermissibleCopyLegalBenchClassification, + ContractNLIPermissibleDevelopmentOfSimilarInformationLegalBenchClassification, + ContractNLIPermissiblePostAgreementPossessionLegalBenchClassification, + ContractNLIReturnOfConfidentialInformationLegalBenchClassification, + ContractNLISharingWithEmployeesLegalBenchClassification, + ContractNLISharingWithThirdPartiesLegalBenchClassification, + ContractNLISurvivalOfObligationsLegalBenchClassification, + CorporateLobbyingLegalBenchClassification, + CUADAffiliateLicenseLicenseeLegalBenchClassification, + 
CUADAffiliateLicenseLicensorLegalBenchClassification, + CUADAntiAssignmentLegalBenchClassification, + CUADAuditRightsLegalBenchClassification, + CUADCapOnLiabilityLegalBenchClassification, + CUADChangeOfControlLegalBenchClassification, + CUADCompetitiveRestrictionExceptionLegalBenchClassification, + CUADCovenantNotToSueLegalBenchClassification, + CUADEffectiveDateLegalBenchClassification, + CUADExclusivityLegalBenchClassification, + CUADExpirationDateLegalBenchClassification, + CUADGoverningLawLegalBenchClassification, + CUADInsuranceLegalBenchClassification, + CUADIPOwnershipAssignmentLegalBenchClassification, + CUADIrrevocableOrPerpetualLicenseLegalBenchClassification, + CUADJointIPOwnershipLegalBenchClassification, + CUADLicenseGrantLegalBenchClassification, + CUADLiquidatedDamagesLegalBenchClassification, + CUADMinimumCommitmentLegalBenchClassification, + CUADMostFavoredNationLegalBenchClassification, + CUADNonCompeteLegalBenchClassification, + CUADNonDisparagementLegalBenchClassification, + CUADNonTransferableLicenseLegalBenchClassification, + CUADNoSolicitOfCustomersLegalBenchClassification, + CUADNoSolicitOfEmployeesLegalBenchClassification, + CUADNoticePeriodToTerminateRenewalLegalBenchClassification, + CUADPostTerminationServicesLegalBenchClassification, + CUADPriceRestrictionsLegalBenchClassification, + CUADRenewalTermLegalBenchClassification, + CUADRevenueProfitSharingLegalBenchClassification, + CUADRofrRofoRofnLegalBenchClassification, + CUADSourceCodeEscrowLegalBenchClassification, + CUADTerminationForConvenienceLegalBenchClassification, + CUADThirdPartyBeneficiaryLegalBenchClassification, + CUADUncappedLiabilityLegalBenchClassification, + CUADUnlimitedAllYouCanEatLicenseLegalBenchClassification, + CUADVolumeRestrictionLegalBenchClassification, + CUADWarrantyDurationLegalBenchClassification, + DBpediaClassification, + DefinitionClassificationLegalBenchClassification, + Diversity1LegalBenchClassification, + Diversity2LegalBenchClassification, + 
Diversity3LegalBenchClassification, + Diversity4LegalBenchClassification, + Diversity5LegalBenchClassification, + Diversity6LegalBenchClassification, + EmotionClassification, + FinancialPhrasebankClassification, + FrenkEnClassification, + FunctionOfDecisionSectionLegalBenchClassification, + ImdbClassification, + InsurancePolicyInterpretationLegalBenchClassification, + InternationalCitizenshipQuestionsLegalBenchClassification, + JCrewBlockerLegalBenchClassification, + LearnedHandsBenefitsLegalBenchClassification, + LearnedHandsBusinessLegalBenchClassification, + LearnedHandsConsumerLegalBenchClassification, + LearnedHandsCourtsLegalBenchClassification, + LearnedHandsCrimeLegalBenchClassification, + LearnedHandsDivorceLegalBenchClassification, + LearnedHandsDomesticViolenceLegalBenchClassification, + LearnedHandsEducationLegalBenchClassification, + LearnedHandsEmploymentLegalBenchClassification, + LearnedHandsEstatesLegalBenchClassification, + LearnedHandsFamilyLegalBenchClassification, + LearnedHandsHealthLegalBenchClassification, + LearnedHandsHousingLegalBenchClassification, + LearnedHandsImmigrationLegalBenchClassification, + LearnedHandsTortsLegalBenchClassification, + LearnedHandsTrafficLegalBenchClassification, + LegalReasoningCausalityLegalBenchClassification, + MAUDLegalBenchClassification, + NewsClassification, + NYSJudicialEthicsLegalBenchClassification, + OPP115DataRetentionLegalBenchClassification, + OPP115DataSecurityLegalBenchClassification, + OPP115DoNotTrackLegalBenchClassification, + OPP115FirstPartyCollectionUseLegalBenchClassification, + OPP115InternationalAndSpecificAudiencesLegalBenchClassification, + OPP115PolicyChangeLegalBenchClassification, + OPP115ThirdPartySharingCollectionLegalBenchClassification, + OPP115UserAccessEditAndDeletionLegalBenchClassification, + OPP115UserChoiceControlLegalBenchClassification, + OralArgumentQuestionPurposeLegalBenchClassification, + OverrulingLegalBenchClassification, + PatentClassification, + 
PersonalJurisdictionLegalBenchClassification, + PoemSentimentClassification, + PROALegalBenchClassification, + SCDBPAccountabilityLegalBenchClassification, + SCDBPAuditsLegalBenchClassification, + SCDBPCertificationLegalBenchClassification, + SCDBPTrainingLegalBenchClassification, + SCDBPVerificationLegalBenchClassification, + SCDDAccountabilityLegalBenchClassification, + SCDDAuditsLegalBenchClassification, + SCDDCertificationLegalBenchClassification, + SCDDTrainingLegalBenchClassification, + SCDDVerificationLegalBenchClassification, + SDSEyeProtectionClassification, + SDSGlovesClassification, + TelemarketingSalesRuleLegalBenchClassification, + TextualismToolDictionariesLegalBenchClassification, + TextualismToolPlainLegalBenchClassification, + ToxicChatClassification, + ToxicConversationsClassification, + TweetSentimentExtractionClassification, + TweetTopicSingleClassification, + UCCVCommonLawLegalBenchClassification, + UnfairTOSLegalBenchClassification, + WikipediaBiolumNeurochemClassification, + WikipediaBioMetChemClassification, + WikipediaChemEngSpecialtiesClassification, + WikipediaChemFieldsClassification, + WikipediaChemistryTopicsClassification, + WikipediaCompChemSpectroscopyClassification, + WikipediaCryobiologySeparationClassification, + WikipediaCrystallographyAnalyticalClassification, + WikipediaGreenhouseEnantiopureClassification, + WikipediaIsotopesFissionClassification, + WikipediaLuminescenceClassification, + WikipediaOrganicInorganicClassification, + WikipediaSaltsSemiconductorsClassification, + WikipediaSolidStateColloidalClassification, + WikipediaTheoreticalAppliedClassification, + YahooAnswersTopicsClassification, + YelpReviewFullClassification, +) +from .est import EstonianValenceClassification +from .fas import ( + DeepSentiPers, + DigikalamagClassification, + NLPTwitterAnalysisClassification, + PersianFoodSentimentClassification, + PersianTextEmotion, + PersianTextTone, + SentimentDKSF, + SIDClassification, + SynPerChatbotConvSAAnger, + 
SynPerChatbotConvSAFear, + SynPerChatbotConvSAFriendship, + SynPerChatbotConvSAHappiness, + SynPerChatbotConvSAJealousy, + SynPerChatbotConvSALove, + SynPerChatbotConvSASadness, + SynPerChatbotConvSASatisfaction, + SynPerChatbotConvSASurprise, + SynPerChatbotConvSAToneChatbotClassification, + SynPerChatbotConvSAToneUserClassification, + SynPerChatbotRAGToneChatbotClassification, + SynPerChatbotRAGToneUserClassification, + SynPerChatbotSatisfactionLevelClassification, + SynPerChatbotToneChatbotClassification, + SynPerChatbotToneUserClassification, +) +from .fil import FilipinoHateSpeechClassification, FilipinoShopeeReviewsClassification +from .fin import FinToxicityClassification +from .fra import FrenchBookReviews, MovieReviewSentimentClassification +from .guj import GujaratiNewsClassification +from .heb import HebrewSentimentAnalysis +from .hin import HindiDiscourseClassification, SentimentAnalysisHindi +from .hrv import FrenkHrClassification +from .ind import ( + IndonesianIdClickbaitClassification, + IndonesianMongabayConservationClassification, +) +from .ita import ItaCaseholdClassification, ItalianLinguisticAcceptabilityClassification +from .jav import JavaneseIMDBClassification +from .jpn import WRIMEClassification +from .kan import KannadaNewsClassification +from .kat import GeorgianSentimentClassification +from .kor import KlueTC, KorFin, KorHateClassification, KorSarcasmClassification +from .kur import KurdishSentimentClassification +from .mal import MalayalamNewsClassification +from .mar import MarathiNewsClassification +from .mkd import MacedonianTweetSentimentClassification +from .multilingual import ( + AfriSentiClassification, + AfriSentiLangClassification, + AmazonCounterfactualClassification, + AmazonReviewsClassification, + CataloniaTweetClassification, + CyrillicTurkicLangClassification, + HinDialectClassification, + IndicLangClassification, + IndicNLPNewsClassification, + IndicSentimentClassification, + LanguageClassification, + 
MasakhaNEWSClassification, + MassiveIntentClassification, + MassiveScenarioClassification, + MTOPDomainClassification, + MTOPIntentClassification, + MultiHateClassification, + MultilingualSentimentClassification, + NaijaSenti, + NordicLangClassification, + NusaParagraphEmotionClassification, + NusaParagraphTopicClassification, + NusaXSentiClassification, + ScalaClassification, + SIB200Classification, + SouthAfricanLangClassification, + SwissJudgementClassification, + TurkicClassification, + TweetSentimentClassification, +) +from .mya import MyanmarNews +from .nep import NepaliNewsClassification +from .nld import DutchBookReviewSentimentClassification +from .nob import NoRecClassification, NorwegianParliamentClassification +from .ory import OdiaNewsClassification +from .pan import PunjabiNewsClassification +from .pol import ( + AllegroReviewsClassification, + CbdClassification, + PacClassification, + PolEmo2InClassification, + PolEmo2OutClassification, +) +from .por import HateSpeechPortugueseClassification +from .ron import Moroco, RomanianReviewsSentiment, RomanianSentimentClassification +from .rus import ( + GeoreviewClassification, + HeadlineClassification, + InappropriatenessClassification, + KinopoiskClassification, + RuReviewsClassification, + RuSciBenchGRNTIClassification, + RuSciBenchOECDClassification, +) +from .san import SanskritShlokasClassification +from .sin import SinhalaNewsClassification, SinhalaNewsSourceClassification +from .slk import ( + CSFDSKMovieReviewSentimentClassification, + SlovakHateSpeechClassification, +) +from .slv import FrenkSlClassification +from .spa import SpanishNewsClassification, SpanishSentimentClassification +from .ssw import SiswatiNewsClassification +from .svk import SlovakMovieReviewSentimentClassification +from .swa import SwahiliNewsClassification +from .swe import ( + DalajClassification, + SwedishSentimentClassification, + SweRecClassification, +) +from .tam import TamilNewsClassification +from .tel import 
TeluguAndhraJyotiNewsClassification +from .tha import WisesightSentimentClassification, WongnaiReviewsClassification +from .tsn import TswanaNewsClassification +from .tur import ( + TurkishMovieSentimentClassification, + TurkishProductSentimentClassification, +) +from .ukr import UkrFormalityClassification +from .urd import UrduRomanSentimentClassification +from .vie import VieStudentFeedbackClassification +from .zho import ( + IFlyTek, + JDReview, + MultilingualSentiment, + OnlineShopping, + TNews, + Waimai, + YueOpenriceReviewClassification, +) +from .zul import IsiZuluNewsClassification + +__all__ = [ + "TeluguAndhraJyotiNewsClassification", + "IFlyTek", + "JDReview", + "MultilingualSentiment", + "OnlineShopping", + "TNews", + "Waimai", + "YueOpenriceReviewClassification", + "HateSpeechPortugueseClassification", + "SpanishNewsClassification", + "SpanishSentimentClassification", + "AllegroReviewsClassification", + "CbdClassification", + "PacClassification", + "PolEmo2InClassification", + "PolEmo2OutClassification", + "BulgarianStoreReviewSentimentClassfication", + "KurdishSentimentClassification", + "ItaCaseholdClassification", + "ItalianLinguisticAcceptabilityClassification", + "GeorgianSentimentClassification", + "DalajClassification", + "SweRecClassification", + "SwedishSentimentClassification", + "CSFDSKMovieReviewSentimentClassification", + "SlovakHateSpeechClassification", + "NorwegianParliamentClassification", + "NoRecClassification", + "FilipinoHateSpeechClassification", + "FilipinoShopeeReviewsClassification", + "MarathiNewsClassification", + "IndonesianIdClickbaitClassification", + "IndonesianMongabayConservationClassification", + "UrduRomanSentimentClassification", + "MacedonianTweetSentimentClassification", + "FrenkSlClassification", + "SwahiliNewsClassification", + "FinToxicityClassification", + "KannadaNewsClassification", + "TenKGnadClassification", + "GermanPoliticiansTwitterSentimentClassification", + "PunjabiNewsClassification", + 
"TswanaNewsClassification", + "TweetSarcasmClassification", + "TweetEmotionClassification", + "RestaurantReviewSentimentClassification", + "HotelReviewSentimentClassification", + "OnlineStoreReviewSentimentClassification", + "AJGT", + "TurkishProductSentimentClassification", + "TurkishMovieSentimentClassification", + "NepaliNewsClassification", + "VieStudentFeedbackClassification", + "DutchBookReviewSentimentClassification", + "SiswatiNewsClassification", + "UkrFormalityClassification", + "SanskritShlokasClassification", + "SlovakMovieReviewSentimentClassification", + "AngryTweetsClassification", + "DdiscoCohesionClassification", + "DanishPoliticalCommentsClassification", + "DKHateClassification", + "LccSentimentClassification", + "TamilNewsClassification", + "CSFDCZMovieReviewSentimentClassification", + "CzechSubjectivityClassification", + "CzechProductReviewSentimentClassification", + "CzechSoMeSentimentClassification", + "EstonianValenceClassification", + "MyanmarNews", + "JavaneseIMDBClassification", + "YahooAnswersTopicsClassification", + "CUADAffiliateLicenseLicenseeLegalBenchClassification", + "CUADAffiliateLicenseLicensorLegalBenchClassification", + "CUADAntiAssignmentLegalBenchClassification", + "CUADAuditRightsLegalBenchClassification", + "CUADCapOnLiabilityLegalBenchClassification", + "CUADChangeOfControlLegalBenchClassification", + "CUADCompetitiveRestrictionExceptionLegalBenchClassification", + "CUADCovenantNotToSueLegalBenchClassification", + "CUADEffectiveDateLegalBenchClassification", + "CUADExclusivityLegalBenchClassification", + "CUADExpirationDateLegalBenchClassification", + "CUADGoverningLawLegalBenchClassification", + "CUADIPOwnershipAssignmentLegalBenchClassification", + "CUADInsuranceLegalBenchClassification", + "CUADIrrevocableOrPerpetualLicenseLegalBenchClassification", + "CUADJointIPOwnershipLegalBenchClassification", + "CUADLicenseGrantLegalBenchClassification", + "CUADLiquidatedDamagesLegalBenchClassification", + 
"CUADMinimumCommitmentLegalBenchClassification", + "CUADMostFavoredNationLegalBenchClassification", + "CUADNoSolicitOfCustomersLegalBenchClassification", + "CUADNoSolicitOfEmployeesLegalBenchClassification", + "CUADNonCompeteLegalBenchClassification", + "CUADNonDisparagementLegalBenchClassification", + "CUADNonTransferableLicenseLegalBenchClassification", + "CUADNoticePeriodToTerminateRenewalLegalBenchClassification", + "CUADPostTerminationServicesLegalBenchClassification", + "CUADPriceRestrictionsLegalBenchClassification", + "CUADRenewalTermLegalBenchClassification", + "CUADRevenueProfitSharingLegalBenchClassification", + "CUADRofrRofoRofnLegalBenchClassification", + "CUADSourceCodeEscrowLegalBenchClassification", + "CUADTerminationForConvenienceLegalBenchClassification", + "CUADThirdPartyBeneficiaryLegalBenchClassification", + "CUADUncappedLiabilityLegalBenchClassification", + "CUADUnlimitedAllYouCanEatLicenseLegalBenchClassification", + "CUADVolumeRestrictionLegalBenchClassification", + "CUADWarrantyDurationLegalBenchClassification", + "CanadaTaxCourtOutcomesLegalBenchClassification", + "ContractNLIConfidentialityOfAgreementLegalBenchClassification", + "ContractNLIExplicitIdentificationLegalBenchClassification", + "ContractNLIInclusionOfVerballyConveyedInformationLegalBenchClassification", + "ContractNLILimitedUseLegalBenchClassification", + "ContractNLINoLicensingLegalBenchClassification", + "ContractNLINoticeOnCompelledDisclosureLegalBenchClassification", + "ContractNLIPermissibleAcquirementOfSimilarInformationLegalBenchClassification", + "ContractNLIPermissibleCopyLegalBenchClassification", + "ContractNLIPermissibleDevelopmentOfSimilarInformationLegalBenchClassification", + "ContractNLIPermissiblePostAgreementPossessionLegalBenchClassification", + "ContractNLIReturnOfConfidentialInformationLegalBenchClassification", + "ContractNLISharingWithEmployeesLegalBenchClassification", + "ContractNLISharingWithThirdPartiesLegalBenchClassification", + 
"ContractNLISurvivalOfObligationsLegalBenchClassification", + "CorporateLobbyingLegalBenchClassification", + "DefinitionClassificationLegalBenchClassification", + "Diversity1LegalBenchClassification", + "Diversity2LegalBenchClassification", + "Diversity3LegalBenchClassification", + "Diversity4LegalBenchClassification", + "Diversity5LegalBenchClassification", + "Diversity6LegalBenchClassification", + "FunctionOfDecisionSectionLegalBenchClassification", + "InsurancePolicyInterpretationLegalBenchClassification", + "InternationalCitizenshipQuestionsLegalBenchClassification", + "JCrewBlockerLegalBenchClassification", + "LearnedHandsBenefitsLegalBenchClassification", + "LearnedHandsBusinessLegalBenchClassification", + "LearnedHandsConsumerLegalBenchClassification", + "LearnedHandsCourtsLegalBenchClassification", + "LearnedHandsCrimeLegalBenchClassification", + "LearnedHandsDivorceLegalBenchClassification", + "LearnedHandsDomesticViolenceLegalBenchClassification", + "LearnedHandsEducationLegalBenchClassification", + "LearnedHandsEmploymentLegalBenchClassification", + "LearnedHandsEstatesLegalBenchClassification", + "LearnedHandsFamilyLegalBenchClassification", + "LearnedHandsHealthLegalBenchClassification", + "LearnedHandsHousingLegalBenchClassification", + "LearnedHandsImmigrationLegalBenchClassification", + "LearnedHandsTortsLegalBenchClassification", + "LearnedHandsTrafficLegalBenchClassification", + "LegalReasoningCausalityLegalBenchClassification", + "MAUDLegalBenchClassification", + "NYSJudicialEthicsLegalBenchClassification", + "OPP115DataRetentionLegalBenchClassification", + "OPP115DataSecurityLegalBenchClassification", + "OPP115DoNotTrackLegalBenchClassification", + "OPP115FirstPartyCollectionUseLegalBenchClassification", + "OPP115InternationalAndSpecificAudiencesLegalBenchClassification", + "OPP115PolicyChangeLegalBenchClassification", + "OPP115ThirdPartySharingCollectionLegalBenchClassification", + "OPP115UserAccessEditAndDeletionLegalBenchClassification", + 
"OPP115UserChoiceControlLegalBenchClassification", + "OralArgumentQuestionPurposeLegalBenchClassification", + "OverrulingLegalBenchClassification", + "PROALegalBenchClassification", + "PersonalJurisdictionLegalBenchClassification", + "SCDBPAccountabilityLegalBenchClassification", + "SCDBPAuditsLegalBenchClassification", + "SCDBPCertificationLegalBenchClassification", + "SCDBPTrainingLegalBenchClassification", + "SCDBPVerificationLegalBenchClassification", + "SCDDAccountabilityLegalBenchClassification", + "SCDDAuditsLegalBenchClassification", + "SCDDCertificationLegalBenchClassification", + "SCDDTrainingLegalBenchClassification", + "SCDDVerificationLegalBenchClassification", + "TelemarketingSalesRuleLegalBenchClassification", + "TextualismToolDictionariesLegalBenchClassification", + "TextualismToolPlainLegalBenchClassification", + "UCCVCommonLawLegalBenchClassification", + "UnfairTOSLegalBenchClassification", + "WikipediaChemEngSpecialtiesClassification", + "FinancialPhrasebankClassification", + "DBpediaClassification", + "WikipediaBiolumNeurochemClassification", + "WikipediaBioMetChemClassification", + "WikipediaIsotopesFissionClassification", + "WikipediaLuminescenceClassification", + "WikipediaChemFieldsClassification", + "FrenkEnClassification", + "TweetTopicSingleClassification", + "WikipediaOrganicInorganicClassification", + "WikipediaCrystallographyAnalyticalClassification", + "WikipediaTheoreticalAppliedClassification", + "WikipediaSaltsSemiconductorsClassification", + "AmazonPolarityClassification", + "NewsClassification", + "ToxicChatClassification", + "SDSEyeProtectionClassification", + "YelpReviewFullClassification", + "WikipediaGreenhouseEnantiopureClassification", + "WikipediaSolidStateColloidalClassification", + "ToxicConversationsClassification", + "TweetSentimentExtractionClassification", + "PatentClassification", + "ImdbClassification", + "SDSGlovesClassification", + "WikipediaCompChemSpectroscopyClassification", + "ArxivClassification", + 
"EmotionClassification", + "PoemSentimentClassification", + "Banking77Classification", + "WikipediaCryobiologySeparationClassification", + "WikipediaChemistryTopicsClassification", + "DeepSentiPers", + "DigikalamagClassification", + "NLPTwitterAnalysisClassification", + "PersianTextEmotion", + "PersianTextTone", + "SIDClassification", + "SentimentDKSF", + "SynPerChatbotConvSAAnger", + "SynPerChatbotConvSAFear", + "SynPerChatbotConvSAFriendship", + "SynPerChatbotConvSAHappiness", + "SynPerChatbotConvSAJealousy", + "SynPerChatbotConvSALove", + "SynPerChatbotConvSASadness", + "SynPerChatbotConvSASatisfaction", + "SynPerChatbotConvSASurprise", + "SynPerChatbotConvSAToneChatbotClassification", + "SynPerChatbotConvSAToneUserClassification", + "SynPerChatbotRAGToneChatbotClassification", + "SynPerChatbotRAGToneUserClassification", + "SynPerChatbotSatisfactionLevelClassification", + "SynPerChatbotToneChatbotClassification", + "SynPerChatbotToneUserClassification", + "PersianFoodSentimentClassification", + "HebrewSentimentAnalysis", + "BengaliSentimentAnalysis", + "BengaliDocumentClassification", + "BengaliHateSpeechClassification", + "SinhalaNewsSourceClassification", + "SinhalaNewsClassification", + "WisesightSentimentClassification", + "WongnaiReviewsClassification", + "WRIMEClassification", + "RomanianReviewsSentiment", + "Moroco", + "RomanianSentimentClassification", + "KorSarcasmClassification", + "KorHateClassification", + "KorFin", + "KlueTC", + "IndicLangClassification", + "SouthAfricanLangClassification", + "SwissJudgementClassification", + "AmazonReviewsClassification", + "NaijaSenti", + "TurkicClassification", + "ScalaClassification", + "MultilingualSentimentClassification", + "SIB200Classification", + "NordicLangClassification", + "NusaParagraphTopicClassification", + "CyrillicTurkicLangClassification", + "IndicNLPNewsClassification", + "MassiveScenarioClassification", + "MTOPIntentClassification", + "NusaParagraphEmotionClassification", + 
"MultiHateClassification", + "AfriSentiClassification", + "IndicSentimentClassification", + "LanguageClassification", + "AfriSentiLangClassification", + "NusaXSentiClassification", + "MTOPDomainClassification", + "HinDialectClassification", + "CataloniaTweetClassification", + "TweetSentimentClassification", + "MassiveIntentClassification", + "AmazonCounterfactualClassification", + "MasakhaNEWSClassification", + "GujaratiNewsClassification", + "IsiZuluNewsClassification", + "KinopoiskClassification", + "HeadlineClassification", + "InappropriatenessClassification", + "RuSciBenchGRNTIClassification", + "RuSciBenchOECDClassification", + "RuReviewsClassification", + "GeoreviewClassification", + "OdiaNewsClassification", + "GreekLegalCodeClassification", + "MovieReviewSentimentClassification", + "FrenchBookReviews", + "FrenkHrClassification", + "HindiDiscourseClassification", + "SentimentAnalysisHindi", + "MalayalamNewsClassification", +] diff --git a/mteb/tasks/Classification/eng/__init__.py b/mteb/tasks/Classification/eng/__init__.py index 01b996a96d..c8d58941b5 100644 --- a/mteb/tasks/Classification/eng/__init__.py +++ b/mteb/tasks/Classification/eng/__init__.py @@ -125,12 +125,53 @@ from .NewsClassification import NewsClassification from .PatentClassification import PatentClassification from .PoemSentimentClassification import PoemSentimentClassification +from .SDSEyeProtectionClassification import SDSEyeProtectionClassification +from .SDSGlovesClassification import SDSGlovesClassification from .ToxicChatClassification import ToxicChatClassification from .ToxicConversationsClassification import ToxicConversationsClassification from .TweetSentimentExtractionClassification import ( TweetSentimentExtractionClassification, ) from .TweetTopicSingleClassification import TweetTopicSingleClassification +from .WikipediaBiolumNeurochemClassification import ( + WikipediaBiolumNeurochemClassification, +) +from .WikipediaBioMetChemClassification import 
WikipediaBioMetChemClassification +from .WikipediaChemEngSpecialtiesClassification import ( + WikipediaChemEngSpecialtiesClassification, +) +from .WikipediaChemFieldsClassification import WikipediaChemFieldsClassification +from .WikipediaChemistryTopicsClassification import ( + WikipediaChemistryTopicsClassification, +) +from .WikipediaCompChemSpectroscopyClassification import ( + WikipediaCompChemSpectroscopyClassification, +) +from .WikipediaCryobiologySeparationClassification import ( + WikipediaCryobiologySeparationClassification, +) +from .WikipediaCrystallographyAnalyticalClassification import ( + WikipediaCrystallographyAnalyticalClassification, +) +from .WikipediaGreenhouseEnantiopureClassification import ( + WikipediaGreenhouseEnantiopureClassification, +) +from .WikipediaIsotopesFissionClassification import ( + WikipediaIsotopesFissionClassification, +) +from .WikipediaLuminescenceClassification import WikipediaLuminescenceClassification +from .WikipediaOrganicInorganicClassification import ( + WikipediaOrganicInorganicClassification, +) +from .WikipediaSaltsSemiconductorsClassification import ( + WikipediaSaltsSemiconductorsClassification, +) +from .WikipediaSolidStateColloidalClassification import ( + WikipediaSolidStateColloidalClassification, +) +from .WikipediaTheoreticalAppliedClassification import ( + WikipediaTheoreticalAppliedClassification, +) from .YahooAnswersTopicsClassification import YahooAnswersTopicsClassification from .YelpReviewFullClassification import YelpReviewFullClassification @@ -248,20 +289,37 @@ "TextualismToolPlainLegalBenchClassification", "UCCVCommonLawLegalBenchClassification", "UnfairTOSLegalBenchClassification", + "WikipediaChemEngSpecialtiesClassification", "FinancialPhrasebankClassification", "DBpediaClassification", + "WikipediaBiolumNeurochemClassification", + "WikipediaBioMetChemClassification", + "WikipediaIsotopesFissionClassification", + "WikipediaLuminescenceClassification", + "WikipediaChemFieldsClassification", 
"FrenkEnClassification", "TweetTopicSingleClassification", + "WikipediaOrganicInorganicClassification", + "WikipediaCrystallographyAnalyticalClassification", + "WikipediaTheoreticalAppliedClassification", + "WikipediaSaltsSemiconductorsClassification", "AmazonPolarityClassification", "NewsClassification", "ToxicChatClassification", + "SDSEyeProtectionClassification", "YelpReviewFullClassification", + "WikipediaGreenhouseEnantiopureClassification", + "WikipediaSolidStateColloidalClassification", "ToxicConversationsClassification", "TweetSentimentExtractionClassification", "PatentClassification", "ImdbClassification", + "SDSGlovesClassification", + "WikipediaCompChemSpectroscopyClassification", "ArxivClassification", "EmotionClassification", "PoemSentimentClassification", "Banking77Classification", + "WikipediaCryobiologySeparationClassification", + "WikipediaChemistryTopicsClassification", ] diff --git a/mteb/tasks/Classification/fas/__init__.py b/mteb/tasks/Classification/fas/__init__.py index 1f35246357..add65811bf 100644 --- a/mteb/tasks/Classification/fas/__init__.py +++ b/mteb/tasks/Classification/fas/__init__.py @@ -1,5 +1,55 @@ from __future__ import annotations +from .FaMTEBClassification import ( + DeepSentiPers, + DigikalamagClassification, + NLPTwitterAnalysisClassification, + PersianTextEmotion, + PersianTextTone, + SentimentDKSF, + SIDClassification, + SynPerChatbotConvSAAnger, + SynPerChatbotConvSAFear, + SynPerChatbotConvSAFriendship, + SynPerChatbotConvSAHappiness, + SynPerChatbotConvSAJealousy, + SynPerChatbotConvSALove, + SynPerChatbotConvSASadness, + SynPerChatbotConvSASatisfaction, + SynPerChatbotConvSASurprise, + SynPerChatbotConvSAToneChatbotClassification, + SynPerChatbotConvSAToneUserClassification, + SynPerChatbotRAGToneChatbotClassification, + SynPerChatbotRAGToneUserClassification, + SynPerChatbotSatisfactionLevelClassification, + SynPerChatbotToneChatbotClassification, + SynPerChatbotToneUserClassification, +) from 
.PersianFoodSentimentClassification import PersianFoodSentimentClassification -__all__ = ["PersianFoodSentimentClassification"] +__all__ = [ + "DeepSentiPers", + "DigikalamagClassification", + "NLPTwitterAnalysisClassification", + "PersianTextEmotion", + "PersianTextTone", + "SIDClassification", + "SentimentDKSF", + "SynPerChatbotConvSAAnger", + "SynPerChatbotConvSAFear", + "SynPerChatbotConvSAFriendship", + "SynPerChatbotConvSAHappiness", + "SynPerChatbotConvSAJealousy", + "SynPerChatbotConvSALove", + "SynPerChatbotConvSASadness", + "SynPerChatbotConvSASatisfaction", + "SynPerChatbotConvSASurprise", + "SynPerChatbotConvSAToneChatbotClassification", + "SynPerChatbotConvSAToneUserClassification", + "SynPerChatbotRAGToneChatbotClassification", + "SynPerChatbotRAGToneUserClassification", + "SynPerChatbotSatisfactionLevelClassification", + "SynPerChatbotToneChatbotClassification", + "SynPerChatbotToneUserClassification", + "PersianFoodSentimentClassification", +] diff --git a/mteb/tasks/Clustering/__init__.py b/mteb/tasks/Clustering/__init__.py index 65d8b01246..4944f498b5 100644 --- a/mteb/tasks/Clustering/__init__.py +++ b/mteb/tasks/Clustering/__init__.py @@ -1,48 +1,195 @@ from __future__ import annotations -from .deu.BlurbsClusteringP2P import * -from .deu.BlurbsClusteringS2S import * -from .deu.TenKGnadClusteringP2P import * -from .deu.TenKGnadClusteringS2S import * -from .eng.ArxivClusteringP2P import * -from .eng.ArxivClusteringS2S import * -from .eng.ArXivHierarchicalClustering import * -from .eng.BigPatentClustering import * -from .eng.BiorxivClusteringP2P import * -from .eng.BiorxivClusteringS2S import * -from .eng.MedrxivClusteringP2P import * -from .eng.MedrxivClusteringS2S import * -from .eng.RedditClustering import * -from .eng.RedditClusteringP2P import * -from .eng.StackExchangeClustering import * -from .eng.StackExchangeClusteringP2P import * -from .eng.TwentyNewsgroupsClustering import * -from .eng.WikiCitiesClustering import * -from 
.eng.WikipediaChemistrySpecialtiesClustering import * -from .eng.WikipediaChemistryTopicsClustering import * -from .fas.FaMTEBClustering import * -from .fra.AlloProfClusteringP2P import * -from .fra.AlloProfClusteringS2S import * -from .fra.HALClusteringS2S import * -from .jpn.LivedoorNewsClustering import * -from .jpn.MewsC16JaClustering import * -from .multilingual.IndicReviewsClusteringP2P import * -from .multilingual.MasakhaNEWSClusteringP2P import * -from .multilingual.MasakhaNEWSClusteringS2S import * -from .multilingual.MLSUMClusteringP2P import * -from .multilingual.MLSUMClusteringS2S import * -from .multilingual.SIB200ClusteringS2S import * -from .multilingual.WikiClusteringP2P import * -from .nob.snl_clustering import * -from .nob.SNLHierarchicalClustering import * -from .nob.vg_clustering import * -from .nob.VGHierarchicalClustering import * -from .pol.PolishClustering import * -from .rom.RomaniBibleClustering import * -from .rus.GeoreviewClusteringP2P import * -from .rus.RuSciBenchGRNTIClusteringP2P import * -from .rus.RuSciBenchOECDClusteringP2P import * -from .spa.SpanishNewsClusteringP2P import * -from .swe.swedn_clustering import * -from .swe.SwednClustering import * -from .zho.CMTEBClustering import * +from .deu import ( + BlurbsClusteringP2P, + BlurbsClusteringP2PFast, + BlurbsClusteringS2S, + BlurbsClusteringS2SFast, + TenKGnadClusteringP2P, + TenKGnadClusteringP2PFast, + TenKGnadClusteringS2S, + TenKGnadClusteringS2SFast, +) +from .eng import ( + ArxivClusteringP2P, + ArxivClusteringP2PFast, + ArxivClusteringS2S, + ArXivHierarchicalClusteringP2P, + ArXivHierarchicalClusteringS2S, + BigPatentClustering, + BigPatentClusteringFast, + BiorxivClusteringP2P, + BiorxivClusteringP2PFast, + BiorxivClusteringS2S, + BiorxivClusteringS2SFast, + MedrxivClusteringP2P, + MedrxivClusteringP2PFast, + MedrxivClusteringS2S, + MedrxivClusteringS2SFast, + RedditClustering, + RedditClusteringP2P, + RedditFastClusteringP2P, + RedditFastClusteringS2S, + 
StackExchangeClustering, + StackExchangeClusteringFast, + StackExchangeClusteringP2P, + StackExchangeClusteringP2PFast, + TwentyNewsgroupsClustering, + TwentyNewsgroupsClusteringFast, + WikiCitiesClustering, + WikipediaChemistrySpecialtiesClustering, + WikipediaChemistryTopicsClustering, +) +from .fas import ( + BeytooteClustering, + DigikalamagClustering, + HamshahriClustring, + NLPTwitterAnalysisClustering, + SIDClustring, +) +from .fra import ( + AlloProfClusteringP2P, + AlloProfClusteringP2PFast, + AlloProfClusteringS2S, + AlloProfClusteringS2SFast, + HALClusteringS2S, + HALClusteringS2SFast, +) +from .jpn import LivedoorNewsClustering, LivedoorNewsClusteringv2, MewsC16JaClustering +from .multilingual import ( + IndicReviewsClusteringP2P, + MasakhaNEWSClusteringP2P, + MasakhaNEWSClusteringS2S, + MLSUMClusteringP2P, + MLSUMClusteringP2PFast, + MLSUMClusteringS2S, + MLSUMClusteringS2SFast, + SIB200ClusteringFast, + WikiClusteringFastP2P, + WikiClusteringP2P, +) +from .nob import ( + SNLClustering, + SNLHierarchicalClusteringP2P, + SNLHierarchicalClusteringS2S, + VGClustering, + VGHierarchicalClusteringP2P, + VGHierarchicalClusteringS2S, +) +from .pol import ( + EightTagsClustering, + EightTagsClusteringFast, + PlscClusteringP2P, + PlscClusteringP2PFast, + PlscClusteringS2S, + PlscClusteringS2SFast, +) +from .rom import RomaniBibleClustering +from .rus import ( + GeoreviewClusteringP2P, + RuSciBenchGRNTIClusteringP2P, + RuSciBenchOECDClusteringP2P, +) +from .spa import SpanishNewsClusteringP2P +from .swe import SwednClustering, SwednClusteringFastS2S, SwednClusteringP2P +from .zho import ( + CLSClusteringFastP2P, + CLSClusteringFastS2S, + CLSClusteringP2P, + CLSClusteringS2S, + ThuNewsClusteringFastP2P, + ThuNewsClusteringFastS2S, + ThuNewsClusteringP2P, + ThuNewsClusteringS2S, +) + +__all__ = [ + "CLSClusteringFastP2P", + "CLSClusteringFastS2S", + "CLSClusteringP2P", + "CLSClusteringS2S", + "ThuNewsClusteringFastP2P", + "ThuNewsClusteringFastS2S", + 
"ThuNewsClusteringP2P", + "ThuNewsClusteringS2S", + "SpanishNewsClusteringP2P", + "EightTagsClustering", + "EightTagsClusteringFast", + "PlscClusteringP2P", + "PlscClusteringP2PFast", + "PlscClusteringS2S", + "PlscClusteringS2SFast", + "SwednClustering", + "SwednClusteringFastS2S", + "SwednClusteringP2P", + "VGClustering", + "SNLHierarchicalClusteringP2P", + "SNLHierarchicalClusteringS2S", + "SNLClustering", + "VGHierarchicalClusteringP2P", + "VGHierarchicalClusteringS2S", + "BlurbsClusteringS2S", + "BlurbsClusteringS2SFast", + "TenKGnadClusteringP2P", + "TenKGnadClusteringP2PFast", + "TenKGnadClusteringS2S", + "TenKGnadClusteringS2SFast", + "BlurbsClusteringP2P", + "BlurbsClusteringP2PFast", + "RomaniBibleClustering", + "MedrxivClusteringS2S", + "MedrxivClusteringS2SFast", + "BiorxivClusteringS2S", + "BiorxivClusteringS2SFast", + "StackExchangeClustering", + "StackExchangeClusteringFast", + "RedditClustering", + "RedditFastClusteringS2S", + "ArxivClusteringS2S", + "ArxivClusteringP2P", + "ArxivClusteringP2PFast", + "MedrxivClusteringP2P", + "MedrxivClusteringP2PFast", + "WikipediaChemistryTopicsClustering", + "WikiCitiesClustering", + "BiorxivClusteringP2P", + "BiorxivClusteringP2PFast", + "TwentyNewsgroupsClustering", + "TwentyNewsgroupsClusteringFast", + "ArXivHierarchicalClusteringP2P", + "ArXivHierarchicalClusteringS2S", + "WikipediaChemistrySpecialtiesClustering", + "BigPatentClustering", + "BigPatentClusteringFast", + "StackExchangeClusteringP2P", + "StackExchangeClusteringP2PFast", + "RedditClusteringP2P", + "RedditFastClusteringP2P", + "BeytooteClustering", + "DigikalamagClustering", + "HamshahriClustring", + "NLPTwitterAnalysisClustering", + "SIDClustring", + "LivedoorNewsClustering", + "LivedoorNewsClusteringv2", + "MewsC16JaClustering", + "WikiClusteringFastP2P", + "WikiClusteringP2P", + "MLSUMClusteringS2S", + "MLSUMClusteringS2SFast", + "MasakhaNEWSClusteringS2S", + "MLSUMClusteringP2P", + "MLSUMClusteringP2PFast", + "IndicReviewsClusteringP2P", + 
"SIB200ClusteringFast", + "MasakhaNEWSClusteringP2P", + "GeoreviewClusteringP2P", + "RuSciBenchGRNTIClusteringP2P", + "RuSciBenchOECDClusteringP2P", + "HALClusteringS2S", + "HALClusteringS2SFast", + "AlloProfClusteringS2S", + "AlloProfClusteringS2SFast", + "AlloProfClusteringP2P", + "AlloProfClusteringP2PFast", +] diff --git a/mteb/tasks/Clustering/eng/__init__.py b/mteb/tasks/Clustering/eng/__init__.py index 39773845ba..16479e1053 100644 --- a/mteb/tasks/Clustering/eng/__init__.py +++ b/mteb/tasks/Clustering/eng/__init__.py @@ -26,6 +26,10 @@ TwentyNewsgroupsClusteringFast, ) from .WikiCitiesClustering import WikiCitiesClustering +from .WikipediaChemistrySpecialtiesClustering import ( + WikipediaChemistrySpecialtiesClustering, +) +from .WikipediaChemistryTopicsClustering import WikipediaChemistryTopicsClustering __all__ = [ "MedrxivClusteringS2S", @@ -41,6 +45,7 @@ "ArxivClusteringP2PFast", "MedrxivClusteringP2P", "MedrxivClusteringP2PFast", + "WikipediaChemistryTopicsClustering", "WikiCitiesClustering", "BiorxivClusteringP2P", "BiorxivClusteringP2PFast", @@ -48,6 +53,7 @@ "TwentyNewsgroupsClusteringFast", "ArXivHierarchicalClusteringP2P", "ArXivHierarchicalClusteringS2S", + "WikipediaChemistrySpecialtiesClustering", "BigPatentClustering", "BigPatentClusteringFast", "StackExchangeClusteringP2P", diff --git a/mteb/tasks/Clustering/fas/__init__.py b/mteb/tasks/Clustering/fas/__init__.py index e69de29bb2..8197f490b5 100644 --- a/mteb/tasks/Clustering/fas/__init__.py +++ b/mteb/tasks/Clustering/fas/__init__.py @@ -0,0 +1,17 @@ +from __future__ import annotations + +from .FaMTEBClustering import ( + BeytooteClustering, + DigikalamagClustering, + HamshahriClustring, + NLPTwitterAnalysisClustering, + SIDClustring, +) + +__all__ = [ + "BeytooteClustering", + "DigikalamagClustering", + "HamshahriClustring", + "NLPTwitterAnalysisClustering", + "SIDClustring", +] diff --git a/mteb/tasks/PairClassification/__init__.py b/mteb/tasks/PairClassification/__init__.py index 
6cd75ea144..b07ee5baab 100644 --- a/mteb/tasks/PairClassification/__init__.py +++ b/mteb/tasks/PairClassification/__init__.py @@ -1,29 +1,84 @@ from __future__ import annotations -from .ara.ArEntail import * -from .ces.CTKFactsNLI import * -from .deu.FalseFriendsDeEnPC import * -from .eng.LegalBenchPC import * -from .eng.PubChemAISentenceParaphrasePC import * -from .eng.PubChemSMILESPC import * -from .eng.PubChemSynonymPC import * -from .eng.PubChemWikiParagraphsPC import * -from .eng.SprintDuplicateQuestionsPC import * -from .eng.TwitterSemEval2015PC import * -from .eng.TwitterURLCorpusPC import * -from .fas.FaMTEBPairClassification import * -from .fas.FarsTail import * -from .hye.ArmenianParaphrasePC import * -from .ind.IndoNLI import * -from .kor.KlueNLI import * -from .multilingual.OpusparcusPC import * -from .multilingual.PawsXPairClassification import * -from .multilingual.PubChemWikiPairClassification import * -from .multilingual.RTE3 import * -from .multilingual.XNLI import * -from .multilingual.XStance import * -from .pol.PolishPC import * -from .por.Assin2RTE import * -from .por.SickBrPC import * -from .rus.TERRa import * -from .zho.CMTEBPairClassification import * +from .ara import ArEntail +from .ces import CTKFactsNLI +from .deu import FalseFriendsDeEnPC +from .eng import ( + LegalBenchPC, + PubChemAISentenceParaphrasePC, + PubChemSMILESPC, + PubChemSynonymPC, + PubChemWikiParagraphsPC, + SprintDuplicateQuestionsPC, + TwitterSemEval2015PC, + TwitterURLCorpusPC, +) +from .fas import ( + CExaPPC, + FarsiParaphraseDetection, + FarsTail, + ParsinluEntail, + ParsinluQueryParaphPC, + SynPerChatbotRAGFAQPC, + SynPerQAPC, + SynPerTextKeywordsPC, +) +from .hye import ArmenianParaphrasePC +from .ind import IndoNLI +from .kor import KlueNLI +from .multilingual import ( + RTE3, + XNLI, + XNLIV2, + IndicXnliPairClassification, + OpusparcusPC, + PawsXPairClassification, + PubChemWikiPairClassification, + XStance, +) +from .pol import CdscePC, PpcPC, PscPC, SickePLPC 
+from .por import Assin2RTE, SickBrPC +from .rus import TERRa +from .zho import Cmnli, Ocnli + +__all__ = [ + "Cmnli", + "Ocnli", + "Assin2RTE", + "SickBrPC", + "CdscePC", + "PpcPC", + "PscPC", + "SickePLPC", + "IndoNLI", + "FalseFriendsDeEnPC", + "ArEntail", + "ArmenianParaphrasePC", + "CTKFactsNLI", + "PubChemSMILESPC", + "PubChemSynonymPC", + "LegalBenchPC", + "TwitterSemEval2015PC", + "PubChemWikiParagraphsPC", + "TwitterURLCorpusPC", + "SprintDuplicateQuestionsPC", + "PubChemAISentenceParaphrasePC", + "FarsTail", + "CExaPPC", + "FarsiParaphraseDetection", + "ParsinluEntail", + "ParsinluQueryParaphPC", + "SynPerChatbotRAGFAQPC", + "SynPerQAPC", + "SynPerTextKeywordsPC", + "KlueNLI", + "IndicXnliPairClassification", + "OpusparcusPC", + "PawsXPairClassification", + "RTE3", + "PubChemWikiPairClassification", + "XStance", + "XNLI", + "XNLIV2", + "TERRa", +] diff --git a/mteb/tasks/PairClassification/eng/__init__.py b/mteb/tasks/PairClassification/eng/__init__.py index 5fdb9d7080..97fd0218f8 100644 --- a/mteb/tasks/PairClassification/eng/__init__.py +++ b/mteb/tasks/PairClassification/eng/__init__.py @@ -1,13 +1,21 @@ from __future__ import annotations from .LegalBenchPC import LegalBenchPC +from .PubChemAISentenceParaphrasePC import PubChemAISentenceParaphrasePC +from .PubChemSMILESPC import PubChemSMILESPC +from .PubChemSynonymPC import PubChemSynonymPC +from .PubChemWikiParagraphsPC import PubChemWikiParagraphsPC from .SprintDuplicateQuestionsPC import SprintDuplicateQuestionsPC from .TwitterSemEval2015PC import TwitterSemEval2015PC from .TwitterURLCorpusPC import TwitterURLCorpusPC __all__ = [ + "PubChemSMILESPC", + "PubChemSynonymPC", "LegalBenchPC", "TwitterSemEval2015PC", + "PubChemWikiParagraphsPC", "TwitterURLCorpusPC", "SprintDuplicateQuestionsPC", + "PubChemAISentenceParaphrasePC", ] diff --git a/mteb/tasks/PairClassification/fas/__init__.py b/mteb/tasks/PairClassification/fas/__init__.py index 81e6cffc84..e9166d374a 100644 --- 
a/mteb/tasks/PairClassification/fas/__init__.py +++ b/mteb/tasks/PairClassification/fas/__init__.py @@ -1,5 +1,23 @@ from __future__ import annotations +from .FaMTEBPairClassification import ( + CExaPPC, + FarsiParaphraseDetection, + ParsinluEntail, + ParsinluQueryParaphPC, + SynPerChatbotRAGFAQPC, + SynPerQAPC, + SynPerTextKeywordsPC, +) from .FarsTail import FarsTail -__all__ = ["FarsTail"] +__all__ = [ + "FarsTail", + "CExaPPC", + "FarsiParaphraseDetection", + "ParsinluEntail", + "ParsinluQueryParaphPC", + "SynPerChatbotRAGFAQPC", + "SynPerQAPC", + "SynPerTextKeywordsPC", +] diff --git a/mteb/tasks/PairClassification/multilingual/__init__.py b/mteb/tasks/PairClassification/multilingual/__init__.py index a9ec6dc333..7533c04a7e 100644 --- a/mteb/tasks/PairClassification/multilingual/__init__.py +++ b/mteb/tasks/PairClassification/multilingual/__init__.py @@ -3,6 +3,7 @@ from .IndicXnliPairClassification import IndicXnliPairClassification from .OpusparcusPC import OpusparcusPC from .PawsXPairClassification import PawsXPairClassification +from .PubChemWikiPairClassification import PubChemWikiPairClassification from .RTE3 import RTE3 from .XNLI import XNLI, XNLIV2 from .XStance import XStance @@ -12,6 +13,7 @@ "OpusparcusPC", "PawsXPairClassification", "RTE3", + "PubChemWikiPairClassification", "XStance", "XNLI", "XNLIV2", diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index 06414da081..cdda72f77a 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -1,165 +1,448 @@ from __future__ import annotations -from .ara.SadeemQuestionRetrieval import * -from .code.AppsRetrieval import * -from .code.CodeEditSearchRetrieval import * -from .code.CodeFeedbackMTRetrieval import * -from .code.CodeFeedbackSTRetrieval import * -from .code.CodeRAG import * -from .code.CodeSearchNetCCRetrieval import * -from .code.CodeSearchNetRetrieval import * -from .code.CodeTransOceanContestRetrieval import * -from 
.code.CodeTransOceanDLRetrieval import * -from .code.COIRCodeSearchNetRetrieval import * -from .code.CosQARetrieval import * -from .code.StackOverflowQARetrieval import * -from .code.SyntheticText2SqlRetrieval import * -from .dan.DanFeverRetrieval import * -from .dan.TV2Nordretrieval import * -from .dan.TwitterHjerneRetrieval import * -from .deu.GerDaLIRRetrieval import * -from .deu.GerDaLIRSmallRetrieval import * -from .deu.GermanDPRRetrieval import * -from .deu.GermanGovServiceRetrieval import * -from .deu.GermanQuADRetrieval import * -from .deu.LegalQuADRetrieval import * -from .ell.GreekCivicsQA import * -from .eng.AILACasedocsRetrieval import * -from .eng.AILAStatutesRetrieval import * -from .eng.AlphaNLIRetrieval import * -from .eng.ARCChallengeRetrieval import * -from .eng.ArguAnaRetrieval import * -from .eng.BrightRetrieval import * -from .eng.ChemHotpotQARetrieval import * -from .eng.ChemNQRetrieval import * -from .eng.ClimateFEVERRetrieval import * -from .eng.CQADupstackAndroidRetrieval import * -from .eng.CQADupstackEnglishRetrieval import * -from .eng.CQADupstackGamingRetrieval import * -from .eng.CQADupstackGisRetrieval import * -from .eng.CQADupstackMathematicaRetrieval import * -from .eng.CQADupstackPhysicsRetrieval import * -from .eng.CQADupstackProgrammersRetrieval import * -from .eng.CQADupstackStatsRetrieval import * -from .eng.CQADupstackTexRetrieval import * -from .eng.CQADupstackUnixRetrieval import * -from .eng.CQADupstackWebmastersRetrieval import * -from .eng.CQADupstackWordpressRetrieval import * -from .eng.DBPediaRetrieval import * -from .eng.FaithDialRetrieval import * -from .eng.FeedbackQARetrieval import * -from .eng.FEVERRetrieval import * -from .eng.FiQA2018Retrieval import * -from .eng.HagridRetrieval import * -from .eng.HellaSwagRetrieval import * -from .eng.HotpotQARetrieval import * -from .eng.LegalBenchConsumerContractsQARetrieval import * -from .eng.LegalBenchCorporateLobbyingRetrieval import * -from 
.eng.LegalSummarizationRetrieval import * -from .eng.LEMBNarrativeQARetrieval import * -from .eng.LEMBNeedleRetrieval import * -from .eng.LEMBPasskeyRetrieval import * -from .eng.LEMBQMSumRetrieval import * -from .eng.LEMBSummScreenFDRetrieval import * -from .eng.LEMBWikimQARetrieval import * -from .eng.LitSearchRetrieval import * -from .eng.MedicalQARetrieval import * -from .eng.MLQuestions import * -from .eng.MSMARCORetrieval import * -from .eng.MSMARCOv2Retrieval import * -from .eng.NanoArguAnaRetrieval import * -from .eng.NanoClimateFeverRetrieval import * -from .eng.NanoDBPediaRetrieval import * -from .eng.NanoFEVERRetrieval import * -from .eng.NanoFiQA2018Retrieval import * -from .eng.NanoHotpotQARetrieval import * -from .eng.NanoMSMARCORetrieval import * -from .eng.NanoNFCorpusRetrieval import * -from .eng.NanoNQRetrieval import * -from .eng.NanoQuoraRetrieval import * -from .eng.NanoSCIDOCSRetrieval import * -from .eng.NanoSciFactRetrieval import * -from .eng.NanoTouche2020Retrieval import * -from .eng.NarrativeQARetrieval import * -from .eng.NFCorpusRetrieval import * -from .eng.NQRetrieval import * -from .eng.PiqaRetrieval import * -from .eng.QuailRetrieval import * -from .eng.QuoraRetrieval import * -from .eng.RARbCodeRetrieval import * -from .eng.RARbMathRetrieval import * -from .eng.SCIDOCSRetrieval import * -from .eng.SciFactRetrieval import * -from .eng.SiqaRetrieval import * -from .eng.SpartQARetrieval import * -from .eng.TempReasonL1Retrieval import * -from .eng.TempReasonL2ContextRetrieval import * -from .eng.TempReasonL2FactRetrieval import * -from .eng.TempReasonL2PureRetrieval import * -from .eng.TempReasonL3ContextRetrieval import * -from .eng.TempReasonL3FactRetrieval import * -from .eng.TempReasonL3PureRetrieval import * -from .eng.TopiOCQARetrieval import * -from .eng.Touche2020Retrieval import * -from .eng.TRECCOVIDRetrieval import * -from .eng.WinoGrandeRetrieval import * -from .est.estqa import * -from .fas.BEIRFa import * -from 
.fas.FaMTEBRetrieval import * -from .fra.AlloprofRetrieval import * -from .fra.BSARDRetrieval import * -from .fra.FQuADRetrieval import * -from .fra.SyntecRetrieval import * -from .hun.HunSum2 import * -from .jpn.JaGovFaqsRetrieval import * -from .jpn.JaqketRetrieval import * -from .jpn.JaQuADRetrieval import * -from .jpn.NLPJournalAbsIntroRetrieval import * -from .jpn.NLPJournalTitleAbsRetrieval import * -from .jpn.NLPJournalTitleIntroRetrieval import * -from .kat.GeorgianFAQRetrieval import * -from .kor.AutoRAGRetrieval import * -from .kor.KoStrategyQA import * -from .multilingual.BelebeleRetrieval import * -from .multilingual.CrossLingualSemanticDiscriminationWMT19 import * -from .multilingual.CrossLingualSemanticDiscriminationWMT21 import * -from .multilingual.CUREv1Retrieval import * -from .multilingual.IndicQARetrieval import * -from .multilingual.MintakaRetrieval import * -from .multilingual.MIRACLRetrieval import * -from .multilingual.MLQARetrieval import * -from .multilingual.MrTidyRetrieval import * -from .multilingual.MultiLongDocRetrieval import * -from .multilingual.NeuCLIR2022Retrieval import * -from .multilingual.NeuCLIR2023Retrieval import * -from .multilingual.PublicHealthQARetrieval import * -from .multilingual.StatcanDialogueDatasetRetrieval import * -from .multilingual.WikipediaRetrievalMultilingual import * -from .multilingual.XMarketRetrieval import * -from .multilingual.XPQARetrieval import * -from .multilingual.XQuADRetrieval import * -from .nob.norquad import * -from .nob.snl_retrieval import * -from .pol.ArguAnaPLRetrieval import * -from .pol.DBPediaPLRetrieval import * -from .pol.FiQAPLRetrieval import * -from .pol.HotpotQAPLRetrieval import * -from .pol.MSMARCOPLRetrieval import * -from .pol.NFCorpusPLRetrieval import * -from .pol.NQPLRetrieval import * -from .pol.QuoraPLRetrieval import * -from .pol.SCIDOCSPLRetrieval import * -from .pol.SciFactPLRetrieval import * -from .pol.TRECCOVIDPLRetrieval import * -from .rus.RiaNewsRetrieval 
import * -from .rus.RuBQRetrieval import * -from .slk.SKQuadRetrieval import * -from .slk.SlovakSumRetrieval import * -from .spa.SpanishPassageRetrievalS2P import * -from .spa.SpanishPassageRetrievalS2S import * -from .swe.SwednRetrieval import * -from .swe.SweFaqRetrieval import * -from .tur.TurHistQuad import * -from .vie.VieQuADRetrieval import * -from .zho.CMTEBRetrieval import * -from .zho.LeCaRDv2Retrieval import * +from .ara import SadeemQuestionRetrieval +from .code import ( + AppsRetrieval, + CodeEditSearchRetrieval, + CodeFeedbackMT, + CodeFeedbackST, + CodeRAGLibraryDocumentationSolutionsRetrieval, + CodeRAGOnlineTutorialsRetrieval, + CodeRAGProgrammingSolutionsRetrieval, + CodeRAGStackoverflowPostsRetrieval, + CodeSearchNetCCRetrieval, + CodeSearchNetRetrieval, + CodeTransOceanContestRetrieval, + CodeTransOceanDLRetrieval, + COIRCodeSearchNetRetrieval, + CosQARetrieval, + StackOverflowQARetrieval, + SyntheticText2SQLRetrieval, +) +from .dan import DanFever, DanFeverRetrieval, TV2Nordretrieval, TwitterHjerneRetrieval +from .deu import ( + GerDaLIR, + GerDaLIRSmall, + GermanDPR, + GermanGovServiceRetrieval, + GermanQuADRetrieval, + LegalQuAD, +) +from .ell import GreekCivicsQA +from .eng import ( + FEVER, + MSMARCO, + NQ, + PIQA, + SCIDOCS, + SIQA, + TRECCOVID, + AILACasedocs, + AILAStatutes, + AlphaNLI, + ARCChallenge, + ArguAna, + BrightRetrieval, + ChemHotpotQARetrieval, + ChemNQRetrieval, + ClimateFEVER, + ClimateFEVERHardNegatives, + CQADupstackAndroidRetrieval, + CQADupstackEnglishRetrieval, + CQADupstackGamingRetrieval, + CQADupstackGisRetrieval, + CQADupstackMathematicaRetrieval, + CQADupstackPhysicsRetrieval, + CQADupstackProgrammersRetrieval, + CQADupstackStatsRetrieval, + CQADupstackTexRetrieval, + CQADupstackUnixRetrieval, + CQADupstackWebmastersRetrieval, + CQADupstackWordpressRetrieval, + DBPedia, + DBPediaHardNegatives, + FaithDialRetrieval, + FeedbackQARetrieval, + FEVERHardNegatives, + FiQA2018, + HagridRetrieval, + HellaSwag, + HotpotQA, 
+ HotpotQAHardNegatives, + LegalBenchConsumerContractsQA, + LegalBenchCorporateLobbying, + LegalSummarization, + LEMBNarrativeQARetrieval, + LEMBNeedleRetrieval, + LEMBPasskeyRetrieval, + LEMBQMSumRetrieval, + LEMBSummScreenFDRetrieval, + LEMBWikimQARetrieval, + LitSearchRetrieval, + MedicalQARetrieval, + MLQuestionsRetrieval, + MSMARCOHardNegatives, + MSMARCOv2, + NanoArguAnaRetrieval, + NanoClimateFeverRetrieval, + NanoDBPediaRetrieval, + NanoFEVERRetrieval, + NanoFiQA2018Retrieval, + NanoHotpotQARetrieval, + NanoMSMARCORetrieval, + NanoNFCorpusRetrieval, + NanoNQRetrieval, + NanoQuoraRetrieval, + NanoSCIDOCSRetrieval, + NanoSciFactRetrieval, + NanoTouche2020Retrieval, + NarrativeQARetrieval, + NFCorpus, + NQHardNegatives, + Quail, + QuoraRetrieval, + QuoraRetrievalHardNegatives, + RARbCode, + RARbMath, + SciFact, + SpartQA, + TempReasonL1, + TempReasonL2Context, + TempReasonL2Fact, + TempReasonL2Pure, + TempReasonL3Context, + TempReasonL3Fact, + TempReasonL3Pure, + TopiOCQARetrieval, + TopiOCQARetrievalHardNegatives, + Touche2020, + Touche2020v3Retrieval, + WinoGrande, +) +from .est import EstQA +from .fas import ( + ArguAnaFa, + ClimateFEVERFa, + CQADupstackAndroidRetrievalFa, + CQADupstackEnglishRetrievalFa, + CQADupstackGamingRetrievalFa, + CQADupstackGisRetrievalFa, + CQADupstackMathematicaRetrievalFa, + CQADupstackPhysicsRetrievalFa, + CQADupstackProgrammersRetrievalFa, + CQADupstackStatsRetrievalFa, + CQADupstackTexRetrievalFa, + CQADupstackUnixRetrievalFa, + CQADupstackWebmastersRetrievalFa, + CQADupstackWordpressRetrievalFa, + DBPediaFa, + FiQA2018Fa, + HotpotQAFa, + MSMARCOFa, + NFCorpusFa, + NQFa, + PersianWebDocumentRetrieval, + QuoraRetrievalFa, + SCIDOCSFa, + SciFactFa, + SynPerChatbotRAGFAQRetrieval, + SynPerChatbotRAGTopicsRetrieval, + SynPerChatbotTopicsRetrieval, + SynPerQARetrieval, + Touche2020Fa, + TRECCOVIDFa, +) +from .fra import AlloprofRetrieval, BSARDRetrieval, FQuADRetrieval, SyntecRetrieval +from .hun import HunSum2AbstractiveRetrieval 
+from .jpn import ( + JaGovFaqsRetrieval, + JaqketRetrieval, + JaQuADRetrieval, + NLPJournalAbsIntroRetrieval, + NLPJournalTitleAbsRetrieval, + NLPJournalTitleIntroRetrieval, +) +from .kat import GeorgianFAQRetrieval +from .kor import AutoRAGRetrieval, KoStrategyQA +from .multilingual import ( + BelebeleRetrieval, + CrossLingualSemanticDiscriminationWMT19, + CrossLingualSemanticDiscriminationWMT21, + CUREv1Retrieval, + IndicQARetrieval, + MintakaRetrieval, + MIRACLRetrieval, + MIRACLRetrievalHardNegatives, + MLQARetrieval, + MrTidyRetrieval, + MultiLongDocRetrieval, + NeuCLIR2022Retrieval, + NeuCLIR2022RetrievalHardNegatives, + NeuCLIR2023Retrieval, + NeuCLIR2023RetrievalHardNegatives, + PublicHealthQARetrieval, + StatcanDialogueDatasetRetrieval, + WikipediaRetrievalMultilingual, + XMarket, + XPQARetrieval, + XQuADRetrieval, +) +from .nob import NorQuadRetrieval, SNLRetrieval +from .pol import ( + MSMARCOPL, + NQPL, + SCIDOCSPL, + TRECCOVIDPL, + ArguAnaPL, + DBPediaPL, + DBPediaPLHardNegatives, + FiQAPLRetrieval, + HotpotQAPL, + HotpotQAPLHardNegatives, + MSMARCOPLHardNegatives, + NFCorpusPL, + NQPLHardNegatives, + QuoraPLRetrieval, + QuoraPLRetrievalHardNegatives, + SciFactPL, +) +from .rus import RiaNewsRetrieval, RiaNewsRetrievalHardNegatives, RuBQRetrieval +from .slk import SKQuadRetrieval, SlovakSumRetrieval +from .spa import SpanishPassageRetrievalS2P, SpanishPassageRetrievalS2S +from .swe import SwednRetrieval, SweFaqRetrieval +from .tur import TurHistQuadRetrieval +from .vie import VieQuADRetrieval +from .zho import ( + CmedqaRetrieval, + CovidRetrieval, + DuRetrieval, + EcomRetrieval, + LeCaRDv2, + MedicalRetrieval, + MMarcoRetrieval, + T2Retrieval, + VideoRetrieval, +) + +__all__ = [ + "CmedqaRetrieval", + "CovidRetrieval", + "DuRetrieval", + "EcomRetrieval", + "MMarcoRetrieval", + "MedicalRetrieval", + "T2Retrieval", + "VideoRetrieval", + "LeCaRDv2", + "SpanishPassageRetrievalS2S", + "SpanishPassageRetrievalS2P", + "MSMARCOPL", + 
"MSMARCOPLHardNegatives", + "SCIDOCSPL", + "SciFactPL", + "ArguAnaPL", + "FiQAPLRetrieval", + "NFCorpusPL", + "QuoraPLRetrieval", + "QuoraPLRetrievalHardNegatives", + "TRECCOVIDPL", + "NQPL", + "NQPLHardNegatives", + "DBPediaPL", + "DBPediaPLHardNegatives", + "HotpotQAPL", + "HotpotQAPLHardNegatives", + "GeorgianFAQRetrieval", + "SwednRetrieval", + "SweFaqRetrieval", + "SlovakSumRetrieval", + "SKQuadRetrieval", + "SNLRetrieval", + "NorQuadRetrieval", + "GermanQuADRetrieval", + "GerDaLIRSmall", + "GermanDPR", + "GermanGovServiceRetrieval", + "LegalQuAD", + "GerDaLIR", + "SadeemQuestionRetrieval", + "TurHistQuadRetrieval", + "VieQuADRetrieval", + "DanFever", + "DanFeverRetrieval", + "TV2Nordretrieval", + "TwitterHjerneRetrieval", + "EstQA", + "Quail", + "Touche2020", + "Touche2020v3Retrieval", + "TempReasonL2Pure", + "LegalSummarization", + "NQ", + "NQHardNegatives", + "SIQA", + "MSMARCO", + "MSMARCOHardNegatives", + "DBPedia", + "DBPediaHardNegatives", + "NarrativeQARetrieval", + "MSMARCOv2", + "CQADupstackTexRetrieval", + "TRECCOVID", + "WinoGrande", + "QuoraRetrieval", + "QuoraRetrievalHardNegatives", + "AlphaNLI", + "LEMBNeedleRetrieval", + "LEMBPasskeyRetrieval", + "CQADupstackAndroidRetrieval", + "TempReasonL2Context", + "NanoDBPediaRetrieval", + "ARCChallenge", + "ChemHotpotQARetrieval", + "LegalBenchCorporateLobbying", + "SCIDOCS", + "MedicalQARetrieval", + "RARbCode", + "LEMBQMSumRetrieval", + "TempReasonL3Context", + "AILAStatutes", + "TopiOCQARetrieval", + "TopiOCQARetrievalHardNegatives", + "ClimateFEVER", + "ClimateFEVERHardNegatives", + "CQADupstackWordpressRetrieval", + "CQADupstackEnglishRetrieval", + "NanoTouche2020Retrieval", + "CQADupstackStatsRetrieval", + "MLQuestionsRetrieval", + "TempReasonL2Fact", + "NanoSciFactRetrieval", + "CQADupstackGamingRetrieval", + "CQADupstackWebmastersRetrieval", + "NanoFiQA2018Retrieval", + "CQADupstackUnixRetrieval", + "TempReasonL3Pure", + "CQADupstackPhysicsRetrieval", + "FiQA2018", + "LitSearchRetrieval", + 
"NanoFEVERRetrieval", + "NanoMSMARCORetrieval", + "FeedbackQARetrieval", + "HagridRetrieval", + "NanoNFCorpusRetrieval", + "FaithDialRetrieval", + "SciFact", + "CQADupstackMathematicaRetrieval", + "RARbMath", + "NanoNQRetrieval", + "HellaSwag", + "PIQA", + "SpartQA", + "BrightRetrieval", + "TempReasonL1", + "HotpotQA", + "HotpotQAHardNegatives", + "NanoClimateFeverRetrieval", + "NanoQuoraRetrieval", + "NanoArguAnaRetrieval", + "LegalBenchConsumerContractsQA", + "NanoHotpotQARetrieval", + "ArguAna", + "LEMBWikimQARetrieval", + "TempReasonL3Fact", + "FEVER", + "FEVERHardNegatives", + "CQADupstackGisRetrieval", + "NanoSCIDOCSRetrieval", + "AILACasedocs", + "NFCorpus", + "ChemNQRetrieval", + "LEMBSummScreenFDRetrieval", + "LEMBNarrativeQARetrieval", + "CQADupstackProgrammersRetrieval", + "PersianWebDocumentRetrieval", + "SynPerChatbotRAGFAQRetrieval", + "SynPerChatbotRAGTopicsRetrieval", + "SynPerChatbotTopicsRetrieval", + "SynPerQARetrieval", + "ArguAnaFa", + "CQADupstackAndroidRetrievalFa", + "CQADupstackEnglishRetrievalFa", + "CQADupstackGamingRetrievalFa", + "CQADupstackGisRetrievalFa", + "CQADupstackMathematicaRetrievalFa", + "CQADupstackPhysicsRetrievalFa", + "CQADupstackProgrammersRetrievalFa", + "CQADupstackStatsRetrievalFa", + "CQADupstackTexRetrievalFa", + "CQADupstackUnixRetrievalFa", + "CQADupstackWebmastersRetrievalFa", + "CQADupstackWordpressRetrievalFa", + "ClimateFEVERFa", + "DBPediaFa", + "FiQA2018Fa", + "HotpotQAFa", + "MSMARCOFa", + "NFCorpusFa", + "NQFa", + "QuoraRetrievalFa", + "SCIDOCSFa", + "SciFactFa", + "TRECCOVIDFa", + "Touche2020Fa", + "JaGovFaqsRetrieval", + "NLPJournalAbsIntroRetrieval", + "JaqketRetrieval", + "NLPJournalTitleAbsRetrieval", + "JaQuADRetrieval", + "NLPJournalTitleIntroRetrieval", + "HunSum2AbstractiveRetrieval", + "AutoRAGRetrieval", + "KoStrategyQA", + "WikipediaRetrievalMultilingual", + "MintakaRetrieval", + "PublicHealthQARetrieval", + "CrossLingualSemanticDiscriminationWMT19", + "MultiLongDocRetrieval", + 
"MIRACLRetrieval", + "MIRACLRetrievalHardNegatives", + "NeuCLIR2022Retrieval", + "NeuCLIR2022RetrievalHardNegatives", + "StatcanDialogueDatasetRetrieval", + "IndicQARetrieval", + "NeuCLIR2023Retrieval", + "NeuCLIR2023RetrievalHardNegatives", + "CrossLingualSemanticDiscriminationWMT21", + "XMarket", + "XPQARetrieval", + "BelebeleRetrieval", + "CUREv1Retrieval", + "MLQARetrieval", + "XQuADRetrieval", + "MrTidyRetrieval", + "CodeTransOceanContestRetrieval", + "CodeTransOceanDLRetrieval", + "CodeFeedbackMT", + "CodeRAGLibraryDocumentationSolutionsRetrieval", + "CodeRAGOnlineTutorialsRetrieval", + "CodeRAGProgrammingSolutionsRetrieval", + "CodeRAGStackoverflowPostsRetrieval", + "CodeSearchNetCCRetrieval", + "StackOverflowQARetrieval", + "CodeFeedbackST", + "CosQARetrieval", + "CodeEditSearchRetrieval", + "SyntheticText2SQLRetrieval", + "AppsRetrieval", + "CodeSearchNetRetrieval", + "COIRCodeSearchNetRetrieval", + "RiaNewsRetrieval", + "RiaNewsRetrievalHardNegatives", + "RuBQRetrieval", + "GreekCivicsQA", + "AlloprofRetrieval", + "BSARDRetrieval", + "SyntecRetrieval", + "FQuADRetrieval", +] diff --git a/mteb/tasks/Retrieval/code/__init__.py b/mteb/tasks/Retrieval/code/__init__.py index bc2a2fad79..d78296f394 100644 --- a/mteb/tasks/Retrieval/code/__init__.py +++ b/mteb/tasks/Retrieval/code/__init__.py @@ -4,6 +4,12 @@ from .CodeEditSearchRetrieval import CodeEditSearchRetrieval from .CodeFeedbackMTRetrieval import CodeFeedbackMT from .CodeFeedbackSTRetrieval import CodeFeedbackST +from .CodeRAG import ( + CodeRAGLibraryDocumentationSolutionsRetrieval, + CodeRAGOnlineTutorialsRetrieval, + CodeRAGProgrammingSolutionsRetrieval, + CodeRAGStackoverflowPostsRetrieval, +) from .CodeSearchNetCCRetrieval import CodeSearchNetCCRetrieval from .CodeSearchNetRetrieval import CodeSearchNetRetrieval from .CodeTransOceanContestRetrieval import CodeTransOceanContestRetrieval @@ -17,6 +23,10 @@ "CodeTransOceanContestRetrieval", "CodeTransOceanDLRetrieval", "CodeFeedbackMT", + 
"CodeRAGLibraryDocumentationSolutionsRetrieval", + "CodeRAGOnlineTutorialsRetrieval", + "CodeRAGProgrammingSolutionsRetrieval", + "CodeRAGStackoverflowPostsRetrieval", "CodeSearchNetCCRetrieval", "StackOverflowQARetrieval", "CodeFeedbackST", diff --git a/mteb/tasks/Retrieval/eng/__init__.py b/mteb/tasks/Retrieval/eng/__init__.py index 9f1717a89f..02d0474b74 100644 --- a/mteb/tasks/Retrieval/eng/__init__.py +++ b/mteb/tasks/Retrieval/eng/__init__.py @@ -6,6 +6,8 @@ from .ARCChallengeRetrieval import ARCChallenge from .ArguAnaRetrieval import ArguAna from .BrightRetrieval import BrightRetrieval +from .ChemHotpotQARetrieval import ChemHotpotQARetrieval +from .ChemNQRetrieval import ChemNQRetrieval from .ClimateFEVERRetrieval import ClimateFEVER, ClimateFEVERHardNegatives from .CQADupstackAndroidRetrieval import CQADupstackAndroidRetrieval from .CQADupstackEnglishRetrieval import CQADupstackEnglishRetrieval @@ -105,6 +107,7 @@ "TempReasonL2Context", "NanoDBPediaRetrieval", "ARCChallenge", + "ChemHotpotQARetrieval", "LegalBenchCorporateLobbying", "SCIDOCS", "MedicalQARetrieval", @@ -162,6 +165,7 @@ "NanoSCIDOCSRetrieval", "AILACasedocs", "NFCorpus", + "ChemNQRetrieval", "LEMBSummScreenFDRetrieval", "LEMBNarrativeQARetrieval", "CQADupstackProgrammersRetrieval", diff --git a/mteb/tasks/Retrieval/fas/__init__.py b/mteb/tasks/Retrieval/fas/__init__.py index e69de29bb2..c6c6c7f12d 100644 --- a/mteb/tasks/Retrieval/fas/__init__.py +++ b/mteb/tasks/Retrieval/fas/__init__.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +from .BEIRFa import ( + ArguAnaFa, + ClimateFEVERFa, + CQADupstackAndroidRetrievalFa, + CQADupstackEnglishRetrievalFa, + CQADupstackGamingRetrievalFa, + CQADupstackGisRetrievalFa, + CQADupstackMathematicaRetrievalFa, + CQADupstackPhysicsRetrievalFa, + CQADupstackProgrammersRetrievalFa, + CQADupstackStatsRetrievalFa, + CQADupstackTexRetrievalFa, + CQADupstackUnixRetrievalFa, + CQADupstackWebmastersRetrievalFa, + CQADupstackWordpressRetrievalFa, + 
DBPediaFa, + FiQA2018Fa, + HotpotQAFa, + MSMARCOFa, + NFCorpusFa, + NQFa, + QuoraRetrievalFa, + SCIDOCSFa, + SciFactFa, + Touche2020Fa, + TRECCOVIDFa, +) +from .FaMTEBRetrieval import ( + PersianWebDocumentRetrieval, + SynPerChatbotRAGFAQRetrieval, + SynPerChatbotRAGTopicsRetrieval, + SynPerChatbotTopicsRetrieval, + SynPerQARetrieval, +) + +__all__ = [ + "PersianWebDocumentRetrieval", + "SynPerChatbotRAGFAQRetrieval", + "SynPerChatbotRAGTopicsRetrieval", + "SynPerChatbotTopicsRetrieval", + "SynPerQARetrieval", + "ArguAnaFa", + "CQADupstackAndroidRetrievalFa", + "CQADupstackEnglishRetrievalFa", + "CQADupstackGamingRetrievalFa", + "CQADupstackGisRetrievalFa", + "CQADupstackMathematicaRetrievalFa", + "CQADupstackPhysicsRetrievalFa", + "CQADupstackProgrammersRetrievalFa", + "CQADupstackStatsRetrievalFa", + "CQADupstackTexRetrievalFa", + "CQADupstackUnixRetrievalFa", + "CQADupstackWebmastersRetrievalFa", + "CQADupstackWordpressRetrievalFa", + "ClimateFEVERFa", + "DBPediaFa", + "FiQA2018Fa", + "HotpotQAFa", + "MSMARCOFa", + "NFCorpusFa", + "NQFa", + "QuoraRetrievalFa", + "SCIDOCSFa", + "SciFactFa", + "TRECCOVIDFa", + "Touche2020Fa", +] diff --git a/mteb/tasks/STS/__init__.py b/mteb/tasks/STS/__init__.py index 471789f1c9..d22db8fc03 100644 --- a/mteb/tasks/STS/__init__.py +++ b/mteb/tasks/STS/__init__.py @@ -1,32 +1,76 @@ from __future__ import annotations -from .deu.GermanSTSBenchmarkSTS import * -from .eng.BiossesSTS import * -from .eng.SickrSTS import * -from .eng.STS12STS import * -from .eng.STS13STS import * -from .eng.STS14STS import * -from .eng.STS15STS import * -from .eng.STS16STS import * -from .eng.STSBenchmarkSTS import * -from .fao.FaroeseSTS import * -from .fas.FaMTEBSTS import * -from .fin.FinParaSTS import * -from .fra.SickFrSTS import * -from .jpn.JSICK import * -from .jpn.JSTS import * -from .kor.KlueSTS import * -from .kor.KorSTS import * -from .multilingual.IndicCrosslingualSTS import * -from .multilingual.SemRel24STS import * -from 
.multilingual.STS17CrosslingualSTS import * -from .multilingual.STS22CrosslingualSTS import * -from .multilingual.STSBenchmarkMultilingualSTS import * -from .pol.PolishSTS import * -from .por.Assin2STS import * -from .por.SickBrSTS import * -from .ron.RonSTS import * -from .rus.RUParaPhraserSTS import * -from .rus.RuSTSBenchmarkSTS import * -from .spa.STSES import * -from .zho.CMTEBSTS import * +from .deu import GermanSTSBenchmarkSTS +from .eng import ( + STS12STS, + STS13STS, + STS14STS, + STS15STS, + STS16STS, + BiossesSTS, + SickrSTS, + STSBenchmarkSTS, +) +from .fao import FaroeseSTS +from .fas import Farsick, Query2Query, SynPerSTS +from .fin import FinParaSTS +from .fra import SickFrSTS +from .jpn import JSICK, JSTS +from .kor import KlueSTS, KorSTS +from .multilingual import ( + IndicCrosslingualSTS, + SemRel24STS, + STS17Crosslingual, + STS22CrosslingualSTS, + STS22CrosslingualSTSv2, + STSBenchmarkMultilingualSTS, +) +from .pol import CdscrSTS, SickrPLSTS +from .por import Assin2STS, SickBrSTS +from .ron import RonSTS +from .rus import RUParaPhraserSTS, RuSTSBenchmarkSTS +from .spa import STSES +from .zho import AFQMC, ATEC, BQ, LCQMC, PAWSX, QBQTC, STSB + +__all__ = [ + "AFQMC", + "ATEC", + "BQ", + "LCQMC", + "PAWSX", + "QBQTC", + "STSB", + "Assin2STS", + "SickBrSTS", + "STSES", + "CdscrSTS", + "SickrPLSTS", + "FinParaSTS", + "GermanSTSBenchmarkSTS", + "STS12STS", + "STS13STS", + "BiossesSTS", + "STS15STS", + "STSBenchmarkSTS", + "SickrSTS", + "STS16STS", + "STS14STS", + "Farsick", + "Query2Query", + "SynPerSTS", + "FaroeseSTS", + "JSICK", + "JSTS", + "RonSTS", + "KorSTS", + "KlueSTS", + "IndicCrosslingualSTS", + "SemRel24STS", + "STS17Crosslingual", + "STS22CrosslingualSTS", + "STS22CrosslingualSTSv2", + "STSBenchmarkMultilingualSTS", + "RUParaPhraserSTS", + "RuSTSBenchmarkSTS", + "SickFrSTS", +] diff --git a/mteb/tasks/STS/fas/__init__.py b/mteb/tasks/STS/fas/__init__.py index e69de29bb2..edbc7f66a1 100644 --- a/mteb/tasks/STS/fas/__init__.py +++ 
b/mteb/tasks/STS/fas/__init__.py @@ -0,0 +1,5 @@ +from __future__ import annotations + +from .FaMTEBSTS import Farsick, Query2Query, SynPerSTS + +__all__ = ["Farsick", "Query2Query", "SynPerSTS"] diff --git a/mteb/tasks/SummaryRetrieval/__init__.py b/mteb/tasks/SummaryRetrieval/__init__.py index d000983be9..be14f1282a 100644 --- a/mteb/tasks/SummaryRetrieval/__init__.py +++ b/mteb/tasks/SummaryRetrieval/__init__.py @@ -1,3 +1,5 @@ from __future__ import annotations -from .fas.FaMTEBSummaryRetrieval import * +from .fas import SAMSumFa, SynPerChatbotRAGSumSRetrieval, SynPerChatbotSumSRetrieval + +__all__ = ["SAMSumFa", "SynPerChatbotRAGSumSRetrieval", "SynPerChatbotSumSRetrieval"] diff --git a/mteb/tasks/SummaryRetrieval/fas/__init__.py b/mteb/tasks/SummaryRetrieval/fas/__init__.py index e69de29bb2..70bf55ad70 100644 --- a/mteb/tasks/SummaryRetrieval/fas/__init__.py +++ b/mteb/tasks/SummaryRetrieval/fas/__init__.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +from .FaMTEBSummaryRetrieval import ( + SAMSumFa, + SynPerChatbotRAGSumSRetrieval, + SynPerChatbotSumSRetrieval, +) + +__all__ = ["SAMSumFa", "SynPerChatbotRAGSumSRetrieval", "SynPerChatbotSumSRetrieval"] diff --git a/mteb/tasks/__init__.py b/mteb/tasks/__init__.py index 8877da707c..49bc4dc53c 100644 --- a/mteb/tasks/__init__.py +++ b/mteb/tasks/__init__.py @@ -1,15 +1,1538 @@ from __future__ import annotations -from .aggregated_tasks import * -from .BitextMining import * -from .Classification import * -from .Clustering import * -from .InstructionRetrieval import * -from .MultiLabelClassification import * -from .PairClassification import * -from .Reranking import * -from .Retrieval import * -from .SpeedTask import * -from .STS import * -from .Summarization import * -from .SummaryRetrieval import * +from .aggregated_tasks import ( + CQADupstackRetrieval, + CQADupstackRetrievalFa, + SynPerChatbotConvSAClassification, +) +from .BitextMining import ( + BibleNLPBitextMining, + BornholmBitextMining, + 
BUCCBitextMining, + BUCCBitextMiningFast, + DiaBLaBitextMining, + FloresBitextMining, + IN22ConvBitextMining, + IN22GenBitextMining, + IndicGenBenchFloresBitextMining, + IWSLT2017BitextMining, + LinceMTBitextMining, + NollySentiBitextMining, + NorwegianCourtsBitextMining, + NTREXBitextMining, + NusaTranslationBitextMining, + NusaXBitextMining, + PhincBitextMining, + PubChemSMILESBitextMining, + RomaTalesBitextMining, + SRNCorpusBitextMining, + TatoebaBitextMining, + TbilisiCityHallBitextMining, + VieMedEVBitextMining, +) +from .Classification import ( + AJGT, + AfriSentiClassification, + AfriSentiLangClassification, + AllegroReviewsClassification, + AmazonCounterfactualClassification, + AmazonPolarityClassification, + AmazonReviewsClassification, + AngryTweetsClassification, + ArxivClassification, + Banking77Classification, + BengaliDocumentClassification, + BengaliHateSpeechClassification, + BengaliSentimentAnalysis, + BulgarianStoreReviewSentimentClassfication, + CanadaTaxCourtOutcomesLegalBenchClassification, + CataloniaTweetClassification, + CbdClassification, + ContractNLIConfidentialityOfAgreementLegalBenchClassification, + ContractNLIExplicitIdentificationLegalBenchClassification, + ContractNLIInclusionOfVerballyConveyedInformationLegalBenchClassification, + ContractNLILimitedUseLegalBenchClassification, + ContractNLINoLicensingLegalBenchClassification, + ContractNLINoticeOnCompelledDisclosureLegalBenchClassification, + ContractNLIPermissibleAcquirementOfSimilarInformationLegalBenchClassification, + ContractNLIPermissibleCopyLegalBenchClassification, + ContractNLIPermissibleDevelopmentOfSimilarInformationLegalBenchClassification, + ContractNLIPermissiblePostAgreementPossessionLegalBenchClassification, + ContractNLIReturnOfConfidentialInformationLegalBenchClassification, + ContractNLISharingWithEmployeesLegalBenchClassification, + ContractNLISharingWithThirdPartiesLegalBenchClassification, + ContractNLISurvivalOfObligationsLegalBenchClassification, + 
CorporateLobbyingLegalBenchClassification, + CSFDCZMovieReviewSentimentClassification, + CSFDSKMovieReviewSentimentClassification, + CUADAffiliateLicenseLicenseeLegalBenchClassification, + CUADAffiliateLicenseLicensorLegalBenchClassification, + CUADAntiAssignmentLegalBenchClassification, + CUADAuditRightsLegalBenchClassification, + CUADCapOnLiabilityLegalBenchClassification, + CUADChangeOfControlLegalBenchClassification, + CUADCompetitiveRestrictionExceptionLegalBenchClassification, + CUADCovenantNotToSueLegalBenchClassification, + CUADEffectiveDateLegalBenchClassification, + CUADExclusivityLegalBenchClassification, + CUADExpirationDateLegalBenchClassification, + CUADGoverningLawLegalBenchClassification, + CUADInsuranceLegalBenchClassification, + CUADIPOwnershipAssignmentLegalBenchClassification, + CUADIrrevocableOrPerpetualLicenseLegalBenchClassification, + CUADJointIPOwnershipLegalBenchClassification, + CUADLicenseGrantLegalBenchClassification, + CUADLiquidatedDamagesLegalBenchClassification, + CUADMinimumCommitmentLegalBenchClassification, + CUADMostFavoredNationLegalBenchClassification, + CUADNonCompeteLegalBenchClassification, + CUADNonDisparagementLegalBenchClassification, + CUADNonTransferableLicenseLegalBenchClassification, + CUADNoSolicitOfCustomersLegalBenchClassification, + CUADNoSolicitOfEmployeesLegalBenchClassification, + CUADNoticePeriodToTerminateRenewalLegalBenchClassification, + CUADPostTerminationServicesLegalBenchClassification, + CUADPriceRestrictionsLegalBenchClassification, + CUADRenewalTermLegalBenchClassification, + CUADRevenueProfitSharingLegalBenchClassification, + CUADRofrRofoRofnLegalBenchClassification, + CUADSourceCodeEscrowLegalBenchClassification, + CUADTerminationForConvenienceLegalBenchClassification, + CUADThirdPartyBeneficiaryLegalBenchClassification, + CUADUncappedLiabilityLegalBenchClassification, + CUADUnlimitedAllYouCanEatLicenseLegalBenchClassification, + CUADVolumeRestrictionLegalBenchClassification, + 
CUADWarrantyDurationLegalBenchClassification, + CyrillicTurkicLangClassification, + CzechProductReviewSentimentClassification, + CzechSoMeSentimentClassification, + CzechSubjectivityClassification, + DalajClassification, + DanishPoliticalCommentsClassification, + DBpediaClassification, + DdiscoCohesionClassification, + DeepSentiPers, + DefinitionClassificationLegalBenchClassification, + DigikalamagClassification, + Diversity1LegalBenchClassification, + Diversity2LegalBenchClassification, + Diversity3LegalBenchClassification, + Diversity4LegalBenchClassification, + Diversity5LegalBenchClassification, + Diversity6LegalBenchClassification, + DKHateClassification, + DutchBookReviewSentimentClassification, + EmotionClassification, + EstonianValenceClassification, + FilipinoHateSpeechClassification, + FilipinoShopeeReviewsClassification, + FinancialPhrasebankClassification, + FinToxicityClassification, + FrenchBookReviews, + FrenkEnClassification, + FrenkHrClassification, + FrenkSlClassification, + FunctionOfDecisionSectionLegalBenchClassification, + GeoreviewClassification, + GeorgianSentimentClassification, + GermanPoliticiansTwitterSentimentClassification, + GreekLegalCodeClassification, + GujaratiNewsClassification, + HateSpeechPortugueseClassification, + HeadlineClassification, + HebrewSentimentAnalysis, + HinDialectClassification, + HindiDiscourseClassification, + HotelReviewSentimentClassification, + IFlyTek, + ImdbClassification, + InappropriatenessClassification, + IndicLangClassification, + IndicNLPNewsClassification, + IndicSentimentClassification, + IndonesianIdClickbaitClassification, + IndonesianMongabayConservationClassification, + InsurancePolicyInterpretationLegalBenchClassification, + InternationalCitizenshipQuestionsLegalBenchClassification, + IsiZuluNewsClassification, + ItaCaseholdClassification, + ItalianLinguisticAcceptabilityClassification, + JavaneseIMDBClassification, + JCrewBlockerLegalBenchClassification, + JDReview, + 
KannadaNewsClassification, + KinopoiskClassification, + KlueTC, + KorFin, + KorHateClassification, + KorSarcasmClassification, + KurdishSentimentClassification, + LanguageClassification, + LccSentimentClassification, + LearnedHandsBenefitsLegalBenchClassification, + LearnedHandsBusinessLegalBenchClassification, + LearnedHandsConsumerLegalBenchClassification, + LearnedHandsCourtsLegalBenchClassification, + LearnedHandsCrimeLegalBenchClassification, + LearnedHandsDivorceLegalBenchClassification, + LearnedHandsDomesticViolenceLegalBenchClassification, + LearnedHandsEducationLegalBenchClassification, + LearnedHandsEmploymentLegalBenchClassification, + LearnedHandsEstatesLegalBenchClassification, + LearnedHandsFamilyLegalBenchClassification, + LearnedHandsHealthLegalBenchClassification, + LearnedHandsHousingLegalBenchClassification, + LearnedHandsImmigrationLegalBenchClassification, + LearnedHandsTortsLegalBenchClassification, + LearnedHandsTrafficLegalBenchClassification, + LegalReasoningCausalityLegalBenchClassification, + MacedonianTweetSentimentClassification, + MalayalamNewsClassification, + MarathiNewsClassification, + MasakhaNEWSClassification, + MassiveIntentClassification, + MassiveScenarioClassification, + MAUDLegalBenchClassification, + Moroco, + MovieReviewSentimentClassification, + MTOPDomainClassification, + MTOPIntentClassification, + MultiHateClassification, + MultilingualSentiment, + MultilingualSentimentClassification, + MyanmarNews, + NaijaSenti, + NepaliNewsClassification, + NewsClassification, + NLPTwitterAnalysisClassification, + NordicLangClassification, + NoRecClassification, + NorwegianParliamentClassification, + NusaParagraphEmotionClassification, + NusaParagraphTopicClassification, + NusaXSentiClassification, + NYSJudicialEthicsLegalBenchClassification, + OdiaNewsClassification, + OnlineShopping, + OnlineStoreReviewSentimentClassification, + OPP115DataRetentionLegalBenchClassification, + OPP115DataSecurityLegalBenchClassification, + 
OPP115DoNotTrackLegalBenchClassification, + OPP115FirstPartyCollectionUseLegalBenchClassification, + OPP115InternationalAndSpecificAudiencesLegalBenchClassification, + OPP115PolicyChangeLegalBenchClassification, + OPP115ThirdPartySharingCollectionLegalBenchClassification, + OPP115UserAccessEditAndDeletionLegalBenchClassification, + OPP115UserChoiceControlLegalBenchClassification, + OralArgumentQuestionPurposeLegalBenchClassification, + OverrulingLegalBenchClassification, + PacClassification, + PatentClassification, + PersianFoodSentimentClassification, + PersianTextEmotion, + PersianTextTone, + PersonalJurisdictionLegalBenchClassification, + PoemSentimentClassification, + PolEmo2InClassification, + PolEmo2OutClassification, + PROALegalBenchClassification, + PunjabiNewsClassification, + RestaurantReviewSentimentClassification, + RomanianReviewsSentiment, + RomanianSentimentClassification, + RuReviewsClassification, + RuSciBenchGRNTIClassification, + RuSciBenchOECDClassification, + SanskritShlokasClassification, + ScalaClassification, + SCDBPAccountabilityLegalBenchClassification, + SCDBPAuditsLegalBenchClassification, + SCDBPCertificationLegalBenchClassification, + SCDBPTrainingLegalBenchClassification, + SCDBPVerificationLegalBenchClassification, + SCDDAccountabilityLegalBenchClassification, + SCDDAuditsLegalBenchClassification, + SCDDCertificationLegalBenchClassification, + SCDDTrainingLegalBenchClassification, + SCDDVerificationLegalBenchClassification, + SDSEyeProtectionClassification, + SDSGlovesClassification, + SentimentAnalysisHindi, + SentimentDKSF, + SIB200Classification, + SIDClassification, + SinhalaNewsClassification, + SinhalaNewsSourceClassification, + SiswatiNewsClassification, + SlovakHateSpeechClassification, + SlovakMovieReviewSentimentClassification, + SouthAfricanLangClassification, + SpanishNewsClassification, + SpanishSentimentClassification, + SwahiliNewsClassification, + SwedishSentimentClassification, + SweRecClassification, + 
SwissJudgementClassification, + SynPerChatbotConvSAAnger, + SynPerChatbotConvSAFear, + SynPerChatbotConvSAFriendship, + SynPerChatbotConvSAHappiness, + SynPerChatbotConvSAJealousy, + SynPerChatbotConvSALove, + SynPerChatbotConvSASadness, + SynPerChatbotConvSASatisfaction, + SynPerChatbotConvSASurprise, + SynPerChatbotConvSAToneChatbotClassification, + SynPerChatbotConvSAToneUserClassification, + SynPerChatbotRAGToneChatbotClassification, + SynPerChatbotRAGToneUserClassification, + SynPerChatbotSatisfactionLevelClassification, + SynPerChatbotToneChatbotClassification, + SynPerChatbotToneUserClassification, + TamilNewsClassification, + TelemarketingSalesRuleLegalBenchClassification, + TeluguAndhraJyotiNewsClassification, + TenKGnadClassification, + TextualismToolDictionariesLegalBenchClassification, + TextualismToolPlainLegalBenchClassification, + TNews, + ToxicChatClassification, + ToxicConversationsClassification, + TswanaNewsClassification, + TurkicClassification, + TurkishMovieSentimentClassification, + TurkishProductSentimentClassification, + TweetEmotionClassification, + TweetSarcasmClassification, + TweetSentimentClassification, + TweetSentimentExtractionClassification, + TweetTopicSingleClassification, + UCCVCommonLawLegalBenchClassification, + UkrFormalityClassification, + UnfairTOSLegalBenchClassification, + UrduRomanSentimentClassification, + VieStudentFeedbackClassification, + Waimai, + WikipediaBiolumNeurochemClassification, + WikipediaBioMetChemClassification, + WikipediaChemEngSpecialtiesClassification, + WikipediaChemFieldsClassification, + WikipediaChemistryTopicsClassification, + WikipediaCompChemSpectroscopyClassification, + WikipediaCryobiologySeparationClassification, + WikipediaCrystallographyAnalyticalClassification, + WikipediaGreenhouseEnantiopureClassification, + WikipediaIsotopesFissionClassification, + WikipediaLuminescenceClassification, + WikipediaOrganicInorganicClassification, + WikipediaSaltsSemiconductorsClassification, + 
WikipediaSolidStateColloidalClassification, + WikipediaTheoreticalAppliedClassification, + WisesightSentimentClassification, + WongnaiReviewsClassification, + WRIMEClassification, + YahooAnswersTopicsClassification, + YelpReviewFullClassification, + YueOpenriceReviewClassification, +) +from .Clustering import ( + AlloProfClusteringP2P, + AlloProfClusteringP2PFast, + AlloProfClusteringS2S, + AlloProfClusteringS2SFast, + ArxivClusteringP2P, + ArxivClusteringP2PFast, + ArxivClusteringS2S, + ArXivHierarchicalClusteringP2P, + ArXivHierarchicalClusteringS2S, + BeytooteClustering, + BigPatentClustering, + BigPatentClusteringFast, + BiorxivClusteringP2P, + BiorxivClusteringP2PFast, + BiorxivClusteringS2S, + BiorxivClusteringS2SFast, + BlurbsClusteringP2P, + BlurbsClusteringP2PFast, + BlurbsClusteringS2S, + BlurbsClusteringS2SFast, + CLSClusteringFastP2P, + CLSClusteringFastS2S, + CLSClusteringP2P, + CLSClusteringS2S, + DigikalamagClustering, + EightTagsClustering, + EightTagsClusteringFast, + GeoreviewClusteringP2P, + HALClusteringS2S, + HALClusteringS2SFast, + HamshahriClustring, + IndicReviewsClusteringP2P, + LivedoorNewsClustering, + LivedoorNewsClusteringv2, + MasakhaNEWSClusteringP2P, + MasakhaNEWSClusteringS2S, + MedrxivClusteringP2P, + MedrxivClusteringP2PFast, + MedrxivClusteringS2S, + MedrxivClusteringS2SFast, + MewsC16JaClustering, + MLSUMClusteringP2P, + MLSUMClusteringP2PFast, + MLSUMClusteringS2S, + MLSUMClusteringS2SFast, + NLPTwitterAnalysisClustering, + PlscClusteringP2P, + PlscClusteringP2PFast, + PlscClusteringS2S, + PlscClusteringS2SFast, + RedditClustering, + RedditClusteringP2P, + RedditFastClusteringP2P, + RedditFastClusteringS2S, + RomaniBibleClustering, + RuSciBenchGRNTIClusteringP2P, + RuSciBenchOECDClusteringP2P, + SIB200ClusteringFast, + SIDClustring, + SNLClustering, + SNLHierarchicalClusteringP2P, + SNLHierarchicalClusteringS2S, + SpanishNewsClusteringP2P, + StackExchangeClustering, + StackExchangeClusteringFast, + StackExchangeClusteringP2P, + 
StackExchangeClusteringP2PFast, + SwednClustering, + SwednClusteringFastS2S, + SwednClusteringP2P, + TenKGnadClusteringP2P, + TenKGnadClusteringP2PFast, + TenKGnadClusteringS2S, + TenKGnadClusteringS2SFast, + ThuNewsClusteringFastP2P, + ThuNewsClusteringFastS2S, + ThuNewsClusteringP2P, + ThuNewsClusteringS2S, + TwentyNewsgroupsClustering, + TwentyNewsgroupsClusteringFast, + VGClustering, + VGHierarchicalClusteringP2P, + VGHierarchicalClusteringS2S, + WikiCitiesClustering, + WikiClusteringFastP2P, + WikiClusteringP2P, + WikipediaChemistrySpecialtiesClustering, + WikipediaChemistryTopicsClustering, +) +from .InstructionReranking import ( + Core17InstructionRetrieval, + News21InstructionRetrieval, + Robust04InstructionRetrieval, + mFollowIR, + mFollowIRCrossLingual, +) +from .InstructionRetrieval import InstructIR +from .MultiLabelClassification import ( + BrazilianToxicTweetsClassification, + CEDRClassification, + KorHateSpeechMLClassification, + MalteseNewsClassification, + MultiEURLEXMultilabelClassification, + SensitiveTopicsClassification, +) +from .PairClassification import ( + RTE3, + XNLI, + XNLIV2, + ArEntail, + ArmenianParaphrasePC, + Assin2RTE, + CdscePC, + CExaPPC, + Cmnli, + CTKFactsNLI, + FalseFriendsDeEnPC, + FarsiParaphraseDetection, + FarsTail, + IndicXnliPairClassification, + IndoNLI, + KlueNLI, + LegalBenchPC, + Ocnli, + OpusparcusPC, + ParsinluEntail, + ParsinluQueryParaphPC, + PawsXPairClassification, + PpcPC, + PscPC, + PubChemAISentenceParaphrasePC, + PubChemSMILESPC, + PubChemSynonymPC, + PubChemWikiPairClassification, + PubChemWikiParagraphsPC, + SickBrPC, + SickePLPC, + SprintDuplicateQuestionsPC, + SynPerChatbotRAGFAQPC, + SynPerQAPC, + SynPerTextKeywordsPC, + TERRa, + TwitterSemEval2015PC, + TwitterURLCorpusPC, + XStance, +) +from .Reranking import ( + AlloprofReranking, + AskUbuntuDupQuestions, + CMedQAv1, + CMedQAv2, + ESCIReranking, + MindSmallReranking, + MIRACLReranking, + MMarcoReranking, + NamaaMrTydiReranking, + NevIR, + 
RuBQReranking, + SciDocsReranking, + StackOverflowDupQuestions, + SyntecReranking, + T2Reranking, + VoyageMMarcoReranking, + WebLINXCandidatesReranking, + WikipediaRerankingMultilingual, +) +from .Retrieval import ( + FEVER, + MSMARCO, + MSMARCOPL, + NQ, + NQPL, + PIQA, + SCIDOCS, + SCIDOCSPL, + SIQA, + TRECCOVID, + TRECCOVIDPL, + AILACasedocs, + AILAStatutes, + AlloprofRetrieval, + AlphaNLI, + AppsRetrieval, + ARCChallenge, + ArguAna, + ArguAnaFa, + ArguAnaPL, + AutoRAGRetrieval, + BelebeleRetrieval, + BrightRetrieval, + BSARDRetrieval, + ChemHotpotQARetrieval, + ChemNQRetrieval, + ClimateFEVER, + ClimateFEVERFa, + ClimateFEVERHardNegatives, + CmedqaRetrieval, + CodeEditSearchRetrieval, + CodeFeedbackMT, + CodeFeedbackST, + CodeRAGLibraryDocumentationSolutionsRetrieval, + CodeRAGOnlineTutorialsRetrieval, + CodeRAGProgrammingSolutionsRetrieval, + CodeRAGStackoverflowPostsRetrieval, + CodeSearchNetCCRetrieval, + CodeSearchNetRetrieval, + CodeTransOceanContestRetrieval, + CodeTransOceanDLRetrieval, + COIRCodeSearchNetRetrieval, + CosQARetrieval, + CovidRetrieval, + CQADupstackAndroidRetrieval, + CQADupstackAndroidRetrievalFa, + CQADupstackEnglishRetrieval, + CQADupstackEnglishRetrievalFa, + CQADupstackGamingRetrieval, + CQADupstackGamingRetrievalFa, + CQADupstackGisRetrieval, + CQADupstackGisRetrievalFa, + CQADupstackMathematicaRetrieval, + CQADupstackMathematicaRetrievalFa, + CQADupstackPhysicsRetrieval, + CQADupstackPhysicsRetrievalFa, + CQADupstackProgrammersRetrieval, + CQADupstackProgrammersRetrievalFa, + CQADupstackStatsRetrieval, + CQADupstackStatsRetrievalFa, + CQADupstackTexRetrieval, + CQADupstackTexRetrievalFa, + CQADupstackUnixRetrieval, + CQADupstackUnixRetrievalFa, + CQADupstackWebmastersRetrieval, + CQADupstackWebmastersRetrievalFa, + CQADupstackWordpressRetrieval, + CQADupstackWordpressRetrievalFa, + CrossLingualSemanticDiscriminationWMT19, + CrossLingualSemanticDiscriminationWMT21, + CUREv1Retrieval, + DanFever, + DanFeverRetrieval, + DBPedia, + 
DBPediaFa, + DBPediaHardNegatives, + DBPediaPL, + DBPediaPLHardNegatives, + DuRetrieval, + EcomRetrieval, + EstQA, + FaithDialRetrieval, + FeedbackQARetrieval, + FEVERHardNegatives, + FiQA2018, + FiQA2018Fa, + FiQAPLRetrieval, + FQuADRetrieval, + GeorgianFAQRetrieval, + GerDaLIR, + GerDaLIRSmall, + GermanDPR, + GermanGovServiceRetrieval, + GermanQuADRetrieval, + GreekCivicsQA, + HagridRetrieval, + HellaSwag, + HotpotQA, + HotpotQAFa, + HotpotQAHardNegatives, + HotpotQAPL, + HotpotQAPLHardNegatives, + HunSum2AbstractiveRetrieval, + IndicQARetrieval, + JaGovFaqsRetrieval, + JaqketRetrieval, + JaQuADRetrieval, + KoStrategyQA, + LeCaRDv2, + LegalBenchConsumerContractsQA, + LegalBenchCorporateLobbying, + LegalQuAD, + LegalSummarization, + LEMBNarrativeQARetrieval, + LEMBNeedleRetrieval, + LEMBPasskeyRetrieval, + LEMBQMSumRetrieval, + LEMBSummScreenFDRetrieval, + LEMBWikimQARetrieval, + LitSearchRetrieval, + MedicalQARetrieval, + MedicalRetrieval, + MintakaRetrieval, + MIRACLRetrieval, + MIRACLRetrievalHardNegatives, + MLQARetrieval, + MLQuestionsRetrieval, + MMarcoRetrieval, + MrTidyRetrieval, + MSMARCOFa, + MSMARCOHardNegatives, + MSMARCOPLHardNegatives, + MSMARCOv2, + MultiLongDocRetrieval, + NanoArguAnaRetrieval, + NanoClimateFeverRetrieval, + NanoDBPediaRetrieval, + NanoFEVERRetrieval, + NanoFiQA2018Retrieval, + NanoHotpotQARetrieval, + NanoMSMARCORetrieval, + NanoNFCorpusRetrieval, + NanoNQRetrieval, + NanoQuoraRetrieval, + NanoSCIDOCSRetrieval, + NanoSciFactRetrieval, + NanoTouche2020Retrieval, + NarrativeQARetrieval, + NeuCLIR2022Retrieval, + NeuCLIR2022RetrievalHardNegatives, + NeuCLIR2023Retrieval, + NeuCLIR2023RetrievalHardNegatives, + NFCorpus, + NFCorpusFa, + NFCorpusPL, + NLPJournalAbsIntroRetrieval, + NLPJournalTitleAbsRetrieval, + NLPJournalTitleIntroRetrieval, + NorQuadRetrieval, + NQFa, + NQHardNegatives, + NQPLHardNegatives, + PersianWebDocumentRetrieval, + PublicHealthQARetrieval, + Quail, + QuoraPLRetrieval, + QuoraPLRetrievalHardNegatives, + 
QuoraRetrieval, + QuoraRetrievalFa, + QuoraRetrievalHardNegatives, + RARbCode, + RARbMath, + RiaNewsRetrieval, + RiaNewsRetrievalHardNegatives, + RuBQRetrieval, + SadeemQuestionRetrieval, + SCIDOCSFa, + SciFact, + SciFactFa, + SciFactPL, + SKQuadRetrieval, + SlovakSumRetrieval, + SNLRetrieval, + SpanishPassageRetrievalS2P, + SpanishPassageRetrievalS2S, + SpartQA, + StackOverflowQARetrieval, + StatcanDialogueDatasetRetrieval, + SwednRetrieval, + SweFaqRetrieval, + SynPerChatbotRAGFAQRetrieval, + SynPerChatbotRAGTopicsRetrieval, + SynPerChatbotTopicsRetrieval, + SynPerQARetrieval, + SyntecRetrieval, + SyntheticText2SQLRetrieval, + T2Retrieval, + TempReasonL1, + TempReasonL2Context, + TempReasonL2Fact, + TempReasonL2Pure, + TempReasonL3Context, + TempReasonL3Fact, + TempReasonL3Pure, + TopiOCQARetrieval, + TopiOCQARetrievalHardNegatives, + Touche2020, + Touche2020Fa, + Touche2020v3Retrieval, + TRECCOVIDFa, + TurHistQuadRetrieval, + TV2Nordretrieval, + TwitterHjerneRetrieval, + VideoRetrieval, + VieQuADRetrieval, + WikipediaRetrievalMultilingual, + WinoGrande, + XMarket, + XPQARetrieval, + XQuADRetrieval, +) +from .SpeedTask import CPUSpeedTask, GPUSpeedTask +from .STS import ( + AFQMC, + ATEC, + BQ, + JSICK, + JSTS, + LCQMC, + PAWSX, + QBQTC, + STS12STS, + STS13STS, + STS14STS, + STS15STS, + STS16STS, + STSB, + STSES, + Assin2STS, + BiossesSTS, + CdscrSTS, + FaroeseSTS, + Farsick, + FinParaSTS, + GermanSTSBenchmarkSTS, + IndicCrosslingualSTS, + KlueSTS, + KorSTS, + Query2Query, + RonSTS, + RUParaPhraserSTS, + RuSTSBenchmarkSTS, + SemRel24STS, + SickBrSTS, + SickFrSTS, + SickrPLSTS, + SickrSTS, + STS17Crosslingual, + STS22CrosslingualSTS, + STS22CrosslingualSTSv2, + STSBenchmarkMultilingualSTS, + STSBenchmarkSTS, + SynPerSTS, +) +from .Summarization import ( + SummEvalFrSummarization, + SummEvalFrSummarizationv2, + SummEvalSummarization, + SummEvalSummarizationv2, +) +from .SummaryRetrieval import ( + SAMSumFa, + SynPerChatbotRAGSumSRetrieval, + 
SynPerChatbotSumSRetrieval, +) + +__all__ = [ + "CLSClusteringFastP2P", + "CLSClusteringFastS2S", + "CLSClusteringP2P", + "CLSClusteringS2S", + "ThuNewsClusteringFastP2P", + "ThuNewsClusteringFastS2S", + "ThuNewsClusteringP2P", + "ThuNewsClusteringS2S", + "SpanishNewsClusteringP2P", + "EightTagsClustering", + "EightTagsClusteringFast", + "PlscClusteringP2P", + "PlscClusteringP2PFast", + "PlscClusteringS2S", + "PlscClusteringS2SFast", + "SwednClustering", + "SwednClusteringFastS2S", + "SwednClusteringP2P", + "VGClustering", + "SNLHierarchicalClusteringP2P", + "SNLHierarchicalClusteringS2S", + "SNLClustering", + "VGHierarchicalClusteringP2P", + "VGHierarchicalClusteringS2S", + "BlurbsClusteringS2S", + "BlurbsClusteringS2SFast", + "TenKGnadClusteringP2P", + "TenKGnadClusteringP2PFast", + "TenKGnadClusteringS2S", + "TenKGnadClusteringS2SFast", + "BlurbsClusteringP2P", + "BlurbsClusteringP2PFast", + "RomaniBibleClustering", + "MedrxivClusteringS2S", + "MedrxivClusteringS2SFast", + "BiorxivClusteringS2S", + "BiorxivClusteringS2SFast", + "StackExchangeClustering", + "StackExchangeClusteringFast", + "RedditClustering", + "RedditFastClusteringS2S", + "ArxivClusteringS2S", + "ArxivClusteringP2P", + "ArxivClusteringP2PFast", + "MedrxivClusteringP2P", + "MedrxivClusteringP2PFast", + "WikipediaChemistryTopicsClustering", + "WikiCitiesClustering", + "BiorxivClusteringP2P", + "BiorxivClusteringP2PFast", + "TwentyNewsgroupsClustering", + "TwentyNewsgroupsClusteringFast", + "ArXivHierarchicalClusteringP2P", + "ArXivHierarchicalClusteringS2S", + "WikipediaChemistrySpecialtiesClustering", + "BigPatentClustering", + "BigPatentClusteringFast", + "StackExchangeClusteringP2P", + "StackExchangeClusteringP2PFast", + "RedditClusteringP2P", + "RedditFastClusteringP2P", + "BeytooteClustering", + "DigikalamagClustering", + "HamshahriClustring", + "NLPTwitterAnalysisClustering", + "SIDClustring", + "LivedoorNewsClustering", + "LivedoorNewsClusteringv2", + "MewsC16JaClustering", + 
"WikiClusteringFastP2P", + "WikiClusteringP2P", + "MLSUMClusteringS2S", + "MLSUMClusteringS2SFast", + "MasakhaNEWSClusteringS2S", + "MLSUMClusteringP2P", + "MLSUMClusteringP2PFast", + "IndicReviewsClusteringP2P", + "SIB200ClusteringFast", + "MasakhaNEWSClusteringP2P", + "GeoreviewClusteringP2P", + "RuSciBenchGRNTIClusteringP2P", + "RuSciBenchOECDClusteringP2P", + "HALClusteringS2S", + "HALClusteringS2SFast", + "AlloProfClusteringS2S", + "AlloProfClusteringS2SFast", + "AlloProfClusteringP2P", + "AlloProfClusteringP2PFast", + "SummEvalSummarization", + "SummEvalSummarizationv2", + "SummEvalFrSummarization", + "SummEvalFrSummarizationv2", + "GPUSpeedTask", + "CPUSpeedTask", + "News21InstructionRetrieval", + "Core17InstructionRetrieval", + "Robust04InstructionRetrieval", + "mFollowIR", + "mFollowIRCrossLingual", + "BrazilianToxicTweetsClassification", + "MalteseNewsClassification", + "KorHateSpeechMLClassification", + "MultiEURLEXMultilabelClassification", + "SensitiveTopicsClassification", + "CEDRClassification", + "TbilisiCityHallBitextMining", + "VieMedEVBitextMining", + "BornholmBitextMining", + "SRNCorpusBitextMining", + "PubChemSMILESBitextMining", + "IN22ConvBitextMining", + "IN22GenBitextMining", + "BUCCBitextMining", + "LinceMTBitextMining", + "NusaTranslationBitextMining", + "DiaBLaBitextMining", + "NTREXBitextMining", + "IndicGenBenchFloresBitextMining", + "NollySentiBitextMining", + "BUCCBitextMiningFast", + "PhincBitextMining", + "TatoebaBitextMining", + "NusaXBitextMining", + "IWSLT2017BitextMining", + "BibleNLPBitextMining", + "FloresBitextMining", + "RomaTalesBitextMining", + "NorwegianCourtsBitextMining", + "Cmnli", + "Ocnli", + "Assin2RTE", + "SickBrPC", + "CdscePC", + "PpcPC", + "PscPC", + "SickePLPC", + "IndoNLI", + "FalseFriendsDeEnPC", + "ArEntail", + "ArmenianParaphrasePC", + "CTKFactsNLI", + "PubChemSMILESPC", + "PubChemSynonymPC", + "LegalBenchPC", + "TwitterSemEval2015PC", + "PubChemWikiParagraphsPC", + "TwitterURLCorpusPC", + 
"SprintDuplicateQuestionsPC", + "PubChemAISentenceParaphrasePC", + "FarsTail", + "CExaPPC", + "FarsiParaphraseDetection", + "ParsinluEntail", + "ParsinluQueryParaphPC", + "SynPerChatbotRAGFAQPC", + "SynPerQAPC", + "SynPerTextKeywordsPC", + "KlueNLI", + "IndicXnliPairClassification", + "OpusparcusPC", + "PawsXPairClassification", + "RTE3", + "PubChemWikiPairClassification", + "XStance", + "XNLI", + "XNLIV2", + "TERRa", + "TeluguAndhraJyotiNewsClassification", + "IFlyTek", + "JDReview", + "MultilingualSentiment", + "OnlineShopping", + "TNews", + "Waimai", + "YueOpenriceReviewClassification", + "HateSpeechPortugueseClassification", + "SpanishNewsClassification", + "SpanishSentimentClassification", + "AllegroReviewsClassification", + "CbdClassification", + "PacClassification", + "PolEmo2InClassification", + "PolEmo2OutClassification", + "BulgarianStoreReviewSentimentClassfication", + "KurdishSentimentClassification", + "ItaCaseholdClassification", + "ItalianLinguisticAcceptabilityClassification", + "GeorgianSentimentClassification", + "DalajClassification", + "SweRecClassification", + "SwedishSentimentClassification", + "CSFDSKMovieReviewSentimentClassification", + "SlovakHateSpeechClassification", + "NorwegianParliamentClassification", + "NoRecClassification", + "FilipinoHateSpeechClassification", + "FilipinoShopeeReviewsClassification", + "MarathiNewsClassification", + "IndonesianIdClickbaitClassification", + "IndonesianMongabayConservationClassification", + "UrduRomanSentimentClassification", + "MacedonianTweetSentimentClassification", + "FrenkSlClassification", + "SwahiliNewsClassification", + "FinToxicityClassification", + "KannadaNewsClassification", + "TenKGnadClassification", + "GermanPoliticiansTwitterSentimentClassification", + "PunjabiNewsClassification", + "TswanaNewsClassification", + "TweetSarcasmClassification", + "TweetEmotionClassification", + "RestaurantReviewSentimentClassification", + "HotelReviewSentimentClassification", + 
"OnlineStoreReviewSentimentClassification", + "AJGT", + "TurkishProductSentimentClassification", + "TurkishMovieSentimentClassification", + "NepaliNewsClassification", + "VieStudentFeedbackClassification", + "DutchBookReviewSentimentClassification", + "SiswatiNewsClassification", + "UkrFormalityClassification", + "SanskritShlokasClassification", + "SlovakMovieReviewSentimentClassification", + "AngryTweetsClassification", + "DdiscoCohesionClassification", + "DanishPoliticalCommentsClassification", + "DKHateClassification", + "LccSentimentClassification", + "TamilNewsClassification", + "CSFDCZMovieReviewSentimentClassification", + "CzechSubjectivityClassification", + "CzechProductReviewSentimentClassification", + "CzechSoMeSentimentClassification", + "EstonianValenceClassification", + "MyanmarNews", + "JavaneseIMDBClassification", + "YahooAnswersTopicsClassification", + "CUADAffiliateLicenseLicenseeLegalBenchClassification", + "CUADAffiliateLicenseLicensorLegalBenchClassification", + "CUADAntiAssignmentLegalBenchClassification", + "CUADAuditRightsLegalBenchClassification", + "CUADCapOnLiabilityLegalBenchClassification", + "CUADChangeOfControlLegalBenchClassification", + "CUADCompetitiveRestrictionExceptionLegalBenchClassification", + "CUADCovenantNotToSueLegalBenchClassification", + "CUADEffectiveDateLegalBenchClassification", + "CUADExclusivityLegalBenchClassification", + "CUADExpirationDateLegalBenchClassification", + "CUADGoverningLawLegalBenchClassification", + "CUADIPOwnershipAssignmentLegalBenchClassification", + "CUADInsuranceLegalBenchClassification", + "CUADIrrevocableOrPerpetualLicenseLegalBenchClassification", + "CUADJointIPOwnershipLegalBenchClassification", + "CUADLicenseGrantLegalBenchClassification", + "CUADLiquidatedDamagesLegalBenchClassification", + "CUADMinimumCommitmentLegalBenchClassification", + "CUADMostFavoredNationLegalBenchClassification", + "CUADNoSolicitOfCustomersLegalBenchClassification", + 
"CUADNoSolicitOfEmployeesLegalBenchClassification", + "CUADNonCompeteLegalBenchClassification", + "CUADNonDisparagementLegalBenchClassification", + "CUADNonTransferableLicenseLegalBenchClassification", + "CUADNoticePeriodToTerminateRenewalLegalBenchClassification", + "CUADPostTerminationServicesLegalBenchClassification", + "CUADPriceRestrictionsLegalBenchClassification", + "CUADRenewalTermLegalBenchClassification", + "CUADRevenueProfitSharingLegalBenchClassification", + "CUADRofrRofoRofnLegalBenchClassification", + "CUADSourceCodeEscrowLegalBenchClassification", + "CUADTerminationForConvenienceLegalBenchClassification", + "CUADThirdPartyBeneficiaryLegalBenchClassification", + "CUADUncappedLiabilityLegalBenchClassification", + "CUADUnlimitedAllYouCanEatLicenseLegalBenchClassification", + "CUADVolumeRestrictionLegalBenchClassification", + "CUADWarrantyDurationLegalBenchClassification", + "CanadaTaxCourtOutcomesLegalBenchClassification", + "ContractNLIConfidentialityOfAgreementLegalBenchClassification", + "ContractNLIExplicitIdentificationLegalBenchClassification", + "ContractNLIInclusionOfVerballyConveyedInformationLegalBenchClassification", + "ContractNLILimitedUseLegalBenchClassification", + "ContractNLINoLicensingLegalBenchClassification", + "ContractNLINoticeOnCompelledDisclosureLegalBenchClassification", + "ContractNLIPermissibleAcquirementOfSimilarInformationLegalBenchClassification", + "ContractNLIPermissibleCopyLegalBenchClassification", + "ContractNLIPermissibleDevelopmentOfSimilarInformationLegalBenchClassification", + "ContractNLIPermissiblePostAgreementPossessionLegalBenchClassification", + "ContractNLIReturnOfConfidentialInformationLegalBenchClassification", + "ContractNLISharingWithEmployeesLegalBenchClassification", + "ContractNLISharingWithThirdPartiesLegalBenchClassification", + "ContractNLISurvivalOfObligationsLegalBenchClassification", + "CorporateLobbyingLegalBenchClassification", + "DefinitionClassificationLegalBenchClassification", + 
"Diversity1LegalBenchClassification", + "Diversity2LegalBenchClassification", + "Diversity3LegalBenchClassification", + "Diversity4LegalBenchClassification", + "Diversity5LegalBenchClassification", + "Diversity6LegalBenchClassification", + "FunctionOfDecisionSectionLegalBenchClassification", + "InsurancePolicyInterpretationLegalBenchClassification", + "InternationalCitizenshipQuestionsLegalBenchClassification", + "JCrewBlockerLegalBenchClassification", + "LearnedHandsBenefitsLegalBenchClassification", + "LearnedHandsBusinessLegalBenchClassification", + "LearnedHandsConsumerLegalBenchClassification", + "LearnedHandsCourtsLegalBenchClassification", + "LearnedHandsCrimeLegalBenchClassification", + "LearnedHandsDivorceLegalBenchClassification", + "LearnedHandsDomesticViolenceLegalBenchClassification", + "LearnedHandsEducationLegalBenchClassification", + "LearnedHandsEmploymentLegalBenchClassification", + "LearnedHandsEstatesLegalBenchClassification", + "LearnedHandsFamilyLegalBenchClassification", + "LearnedHandsHealthLegalBenchClassification", + "LearnedHandsHousingLegalBenchClassification", + "LearnedHandsImmigrationLegalBenchClassification", + "LearnedHandsTortsLegalBenchClassification", + "LearnedHandsTrafficLegalBenchClassification", + "LegalReasoningCausalityLegalBenchClassification", + "MAUDLegalBenchClassification", + "NYSJudicialEthicsLegalBenchClassification", + "OPP115DataRetentionLegalBenchClassification", + "OPP115DataSecurityLegalBenchClassification", + "OPP115DoNotTrackLegalBenchClassification", + "OPP115FirstPartyCollectionUseLegalBenchClassification", + "OPP115InternationalAndSpecificAudiencesLegalBenchClassification", + "OPP115PolicyChangeLegalBenchClassification", + "OPP115ThirdPartySharingCollectionLegalBenchClassification", + "OPP115UserAccessEditAndDeletionLegalBenchClassification", + "OPP115UserChoiceControlLegalBenchClassification", + "OralArgumentQuestionPurposeLegalBenchClassification", + "OverrulingLegalBenchClassification", + 
"PROALegalBenchClassification", + "PersonalJurisdictionLegalBenchClassification", + "SCDBPAccountabilityLegalBenchClassification", + "SCDBPAuditsLegalBenchClassification", + "SCDBPCertificationLegalBenchClassification", + "SCDBPTrainingLegalBenchClassification", + "SCDBPVerificationLegalBenchClassification", + "SCDDAccountabilityLegalBenchClassification", + "SCDDAuditsLegalBenchClassification", + "SCDDCertificationLegalBenchClassification", + "SCDDTrainingLegalBenchClassification", + "SCDDVerificationLegalBenchClassification", + "TelemarketingSalesRuleLegalBenchClassification", + "TextualismToolDictionariesLegalBenchClassification", + "TextualismToolPlainLegalBenchClassification", + "UCCVCommonLawLegalBenchClassification", + "UnfairTOSLegalBenchClassification", + "WikipediaChemEngSpecialtiesClassification", + "FinancialPhrasebankClassification", + "DBpediaClassification", + "WikipediaBiolumNeurochemClassification", + "WikipediaBioMetChemClassification", + "WikipediaIsotopesFissionClassification", + "WikipediaLuminescenceClassification", + "WikipediaChemFieldsClassification", + "FrenkEnClassification", + "TweetTopicSingleClassification", + "WikipediaOrganicInorganicClassification", + "WikipediaCrystallographyAnalyticalClassification", + "WikipediaTheoreticalAppliedClassification", + "WikipediaSaltsSemiconductorsClassification", + "AmazonPolarityClassification", + "NewsClassification", + "ToxicChatClassification", + "SDSEyeProtectionClassification", + "YelpReviewFullClassification", + "WikipediaGreenhouseEnantiopureClassification", + "WikipediaSolidStateColloidalClassification", + "ToxicConversationsClassification", + "TweetSentimentExtractionClassification", + "PatentClassification", + "ImdbClassification", + "SDSGlovesClassification", + "WikipediaCompChemSpectroscopyClassification", + "ArxivClassification", + "EmotionClassification", + "PoemSentimentClassification", + "Banking77Classification", + "WikipediaCryobiologySeparationClassification", + 
"WikipediaChemistryTopicsClassification", + "DeepSentiPers", + "DigikalamagClassification", + "NLPTwitterAnalysisClassification", + "PersianTextEmotion", + "PersianTextTone", + "SIDClassification", + "SentimentDKSF", + "SynPerChatbotConvSAAnger", + "SynPerChatbotConvSAFear", + "SynPerChatbotConvSAFriendship", + "SynPerChatbotConvSAHappiness", + "SynPerChatbotConvSAJealousy", + "SynPerChatbotConvSALove", + "SynPerChatbotConvSASadness", + "SynPerChatbotConvSASatisfaction", + "SynPerChatbotConvSASurprise", + "SynPerChatbotConvSAToneChatbotClassification", + "SynPerChatbotConvSAToneUserClassification", + "SynPerChatbotRAGToneChatbotClassification", + "SynPerChatbotRAGToneUserClassification", + "SynPerChatbotSatisfactionLevelClassification", + "SynPerChatbotToneChatbotClassification", + "SynPerChatbotToneUserClassification", + "PersianFoodSentimentClassification", + "HebrewSentimentAnalysis", + "BengaliSentimentAnalysis", + "BengaliDocumentClassification", + "BengaliHateSpeechClassification", + "SinhalaNewsSourceClassification", + "SinhalaNewsClassification", + "WisesightSentimentClassification", + "WongnaiReviewsClassification", + "WRIMEClassification", + "RomanianReviewsSentiment", + "Moroco", + "RomanianSentimentClassification", + "KorSarcasmClassification", + "KorHateClassification", + "KorFin", + "KlueTC", + "IndicLangClassification", + "SouthAfricanLangClassification", + "SwissJudgementClassification", + "AmazonReviewsClassification", + "NaijaSenti", + "TurkicClassification", + "ScalaClassification", + "MultilingualSentimentClassification", + "SIB200Classification", + "NordicLangClassification", + "NusaParagraphTopicClassification", + "CyrillicTurkicLangClassification", + "IndicNLPNewsClassification", + "MassiveScenarioClassification", + "MTOPIntentClassification", + "NusaParagraphEmotionClassification", + "MultiHateClassification", + "AfriSentiClassification", + "IndicSentimentClassification", + "LanguageClassification", + "AfriSentiLangClassification", + 
"NusaXSentiClassification", + "MTOPDomainClassification", + "HinDialectClassification", + "CataloniaTweetClassification", + "TweetSentimentClassification", + "MassiveIntentClassification", + "AmazonCounterfactualClassification", + "MasakhaNEWSClassification", + "GujaratiNewsClassification", + "IsiZuluNewsClassification", + "KinopoiskClassification", + "HeadlineClassification", + "InappropriatenessClassification", + "RuSciBenchGRNTIClassification", + "RuSciBenchOECDClassification", + "RuReviewsClassification", + "GeoreviewClassification", + "OdiaNewsClassification", + "GreekLegalCodeClassification", + "MovieReviewSentimentClassification", + "FrenchBookReviews", + "FrenkHrClassification", + "HindiDiscourseClassification", + "SentimentAnalysisHindi", + "MalayalamNewsClassification", + "InstructIR", + "CQADupstackAndroidRetrieval", + "CQADupstackEnglishRetrieval", + "CQADupstackGamingRetrieval", + "CQADupstackGisRetrieval", + "CQADupstackMathematicaRetrieval", + "CQADupstackPhysicsRetrieval", + "CQADupstackProgrammersRetrieval", + "CQADupstackRetrieval", + "CQADupstackStatsRetrieval", + "CQADupstackTexRetrieval", + "CQADupstackUnixRetrieval", + "CQADupstackWebmastersRetrieval", + "CQADupstackWordpressRetrieval", + "CQADupstackAndroidRetrievalFa", + "CQADupstackEnglishRetrievalFa", + "CQADupstackGamingRetrievalFa", + "CQADupstackGisRetrievalFa", + "CQADupstackMathematicaRetrievalFa", + "CQADupstackPhysicsRetrievalFa", + "CQADupstackProgrammersRetrievalFa", + "CQADupstackRetrievalFa", + "CQADupstackStatsRetrievalFa", + "CQADupstackTexRetrievalFa", + "CQADupstackUnixRetrievalFa", + "CQADupstackWebmastersRetrievalFa", + "CQADupstackWordpressRetrievalFa", + "SynPerChatbotConvSAClassification", + "CmedqaRetrieval", + "CovidRetrieval", + "DuRetrieval", + "EcomRetrieval", + "MMarcoRetrieval", + "MedicalRetrieval", + "T2Retrieval", + "VideoRetrieval", + "LeCaRDv2", + "SpanishPassageRetrievalS2S", + "SpanishPassageRetrievalS2P", + "MSMARCOPL", + "MSMARCOPLHardNegatives", + 
"SCIDOCSPL", + "SciFactPL", + "ArguAnaPL", + "FiQAPLRetrieval", + "NFCorpusPL", + "QuoraPLRetrieval", + "QuoraPLRetrievalHardNegatives", + "TRECCOVIDPL", + "NQPL", + "NQPLHardNegatives", + "DBPediaPL", + "DBPediaPLHardNegatives", + "HotpotQAPL", + "HotpotQAPLHardNegatives", + "GeorgianFAQRetrieval", + "SwednRetrieval", + "SweFaqRetrieval", + "SlovakSumRetrieval", + "SKQuadRetrieval", + "SNLRetrieval", + "NorQuadRetrieval", + "GermanQuADRetrieval", + "GerDaLIRSmall", + "GermanDPR", + "GermanGovServiceRetrieval", + "LegalQuAD", + "GerDaLIR", + "SadeemQuestionRetrieval", + "TurHistQuadRetrieval", + "VieQuADRetrieval", + "DanFever", + "DanFeverRetrieval", + "TV2Nordretrieval", + "TwitterHjerneRetrieval", + "EstQA", + "Quail", + "Touche2020", + "Touche2020v3Retrieval", + "TempReasonL2Pure", + "LegalSummarization", + "NQ", + "NQHardNegatives", + "SIQA", + "MSMARCO", + "MSMARCOHardNegatives", + "DBPedia", + "DBPediaHardNegatives", + "NarrativeQARetrieval", + "MSMARCOv2", + "TRECCOVID", + "WinoGrande", + "QuoraRetrieval", + "QuoraRetrievalHardNegatives", + "AlphaNLI", + "LEMBNeedleRetrieval", + "LEMBPasskeyRetrieval", + "TempReasonL2Context", + "NanoDBPediaRetrieval", + "ARCChallenge", + "ChemHotpotQARetrieval", + "LegalBenchCorporateLobbying", + "SCIDOCS", + "MedicalQARetrieval", + "RARbCode", + "LEMBQMSumRetrieval", + "TempReasonL3Context", + "AILAStatutes", + "TopiOCQARetrieval", + "TopiOCQARetrievalHardNegatives", + "ClimateFEVER", + "ClimateFEVERHardNegatives", + "NanoTouche2020Retrieval", + "MLQuestionsRetrieval", + "TempReasonL2Fact", + "NanoSciFactRetrieval", + "NanoFiQA2018Retrieval", + "TempReasonL3Pure", + "FiQA2018", + "LitSearchRetrieval", + "NanoFEVERRetrieval", + "NanoMSMARCORetrieval", + "FeedbackQARetrieval", + "HagridRetrieval", + "NanoNFCorpusRetrieval", + "FaithDialRetrieval", + "SciFact", + "RARbMath", + "NanoNQRetrieval", + "HellaSwag", + "PIQA", + "SpartQA", + "BrightRetrieval", + "TempReasonL1", + "HotpotQA", + "HotpotQAHardNegatives", + 
"NanoClimateFeverRetrieval", + "NanoQuoraRetrieval", + "NanoArguAnaRetrieval", + "LegalBenchConsumerContractsQA", + "NanoHotpotQARetrieval", + "ArguAna", + "LEMBWikimQARetrieval", + "TempReasonL3Fact", + "FEVER", + "FEVERHardNegatives", + "NanoSCIDOCSRetrieval", + "AILACasedocs", + "NFCorpus", + "ChemNQRetrieval", + "LEMBSummScreenFDRetrieval", + "LEMBNarrativeQARetrieval", + "PersianWebDocumentRetrieval", + "SynPerChatbotRAGFAQRetrieval", + "SynPerChatbotRAGTopicsRetrieval", + "SynPerChatbotTopicsRetrieval", + "SynPerQARetrieval", + "ArguAnaFa", + "ClimateFEVERFa", + "DBPediaFa", + "FiQA2018Fa", + "HotpotQAFa", + "MSMARCOFa", + "NFCorpusFa", + "NQFa", + "QuoraRetrievalFa", + "SCIDOCSFa", + "SciFactFa", + "TRECCOVIDFa", + "Touche2020Fa", + "JaGovFaqsRetrieval", + "NLPJournalAbsIntroRetrieval", + "JaqketRetrieval", + "NLPJournalTitleAbsRetrieval", + "JaQuADRetrieval", + "NLPJournalTitleIntroRetrieval", + "HunSum2AbstractiveRetrieval", + "AutoRAGRetrieval", + "KoStrategyQA", + "WikipediaRetrievalMultilingual", + "MintakaRetrieval", + "PublicHealthQARetrieval", + "CrossLingualSemanticDiscriminationWMT19", + "MultiLongDocRetrieval", + "MIRACLRetrieval", + "MIRACLRetrievalHardNegatives", + "NeuCLIR2022Retrieval", + "NeuCLIR2022RetrievalHardNegatives", + "StatcanDialogueDatasetRetrieval", + "IndicQARetrieval", + "NeuCLIR2023Retrieval", + "NeuCLIR2023RetrievalHardNegatives", + "CrossLingualSemanticDiscriminationWMT21", + "XMarket", + "XPQARetrieval", + "BelebeleRetrieval", + "CUREv1Retrieval", + "MLQARetrieval", + "XQuADRetrieval", + "MrTidyRetrieval", + "CodeTransOceanContestRetrieval", + "CodeTransOceanDLRetrieval", + "CodeFeedbackMT", + "CodeRAGLibraryDocumentationSolutionsRetrieval", + "CodeRAGOnlineTutorialsRetrieval", + "CodeRAGProgrammingSolutionsRetrieval", + "CodeRAGStackoverflowPostsRetrieval", + "CodeSearchNetCCRetrieval", + "StackOverflowQARetrieval", + "CodeFeedbackST", + "CosQARetrieval", + "CodeEditSearchRetrieval", + "SyntheticText2SQLRetrieval", + 
"AppsRetrieval", + "CodeSearchNetRetrieval", + "COIRCodeSearchNetRetrieval", + "RiaNewsRetrieval", + "RiaNewsRetrievalHardNegatives", + "RuBQRetrieval", + "GreekCivicsQA", + "AlloprofRetrieval", + "BSARDRetrieval", + "SyntecRetrieval", + "FQuADRetrieval", + "SAMSumFa", + "SynPerChatbotRAGSumSRetrieval", + "SynPerChatbotSumSRetrieval", + "AFQMC", + "ATEC", + "BQ", + "LCQMC", + "PAWSX", + "QBQTC", + "STSB", + "Assin2STS", + "SickBrSTS", + "STSES", + "CdscrSTS", + "SickrPLSTS", + "FinParaSTS", + "GermanSTSBenchmarkSTS", + "STS12STS", + "STS13STS", + "BiossesSTS", + "STS15STS", + "STSBenchmarkSTS", + "SickrSTS", + "STS16STS", + "STS14STS", + "Farsick", + "Query2Query", + "SynPerSTS", + "FaroeseSTS", + "JSICK", + "JSTS", + "RonSTS", + "KorSTS", + "KlueSTS", + "IndicCrosslingualSTS", + "SemRel24STS", + "STS17Crosslingual", + "STS22CrosslingualSTS", + "STS22CrosslingualSTSv2", + "STSBenchmarkMultilingualSTS", + "RUParaPhraserSTS", + "RuSTSBenchmarkSTS", + "SickFrSTS", + "CMedQAv1", + "CMedQAv2", + "MMarcoReranking", + "T2Reranking", + "NamaaMrTydiReranking", + "AskUbuntuDupQuestions", + "WebLINXCandidatesReranking", + "StackOverflowDupQuestions", + "NevIR", + "MindSmallReranking", + "SciDocsReranking", + "VoyageMMarcoReranking", + "MIRACLReranking", + "ESCIReranking", + "WikipediaRerankingMultilingual", + "RuBQReranking", + "SyntecReranking", + "AlloprofReranking", +] diff --git a/mteb/tasks/aggregated_tasks/__init__.py b/mteb/tasks/aggregated_tasks/__init__.py index 5333db7916..c363375144 100644 --- a/mteb/tasks/aggregated_tasks/__init__.py +++ b/mteb/tasks/aggregated_tasks/__init__.py @@ -1,8 +1,14 @@ from __future__ import annotations -from .CQADupStackRetrieval import CQADupstackRetrieval -from .CQADupStackRetrievalFa import CQADupstackRetrievalFa -from .SynPerChatbotConvSAClassification import SynPerChatbotConvSAClassification +from .CQADupStackRetrieval import ( + CQADupstackRetrieval, +) +from .CQADupStackRetrievalFa import ( + CQADupstackRetrievalFa, +) +from 
.SynPerChatbotConvSAClassification import ( + SynPerChatbotConvSAClassification, +) __all__ = [ "CQADupstackRetrieval", diff --git a/tests/test_tasks/test_all_abstasks.py b/tests/test_tasks/test_all_abstasks.py index c880eaf3ed..9a5843bbfd 100644 --- a/tests/test_tasks/test_all_abstasks.py +++ b/tests/test_tasks/test_all_abstasks.py @@ -8,13 +8,12 @@ import pytest import mteb -from mteb.abstasks import AbsTask, MultilingualTask +from mteb.abstasks import AbsTask from mteb.abstasks.AbsTaskReranking import AbsTaskReranking from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval from mteb.abstasks.AbsTaskSpeedTask import AbsTaskSpeedTask -from mteb.overview import TASKS_REGISTRY, get_tasks from mteb.abstasks.aggregated_task import AbsTaskAggregate -from mteb.overview import TASKS_REGISTRY +from mteb.overview import TASKS_REGISTRY, get_tasks from ..test_benchmark.task_grid import MOCK_TASK_TEST_GRID_AS_STRING From 4f2ce0373810988fbaf707e4bdbe46f168199b4a Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 2 Feb 2025 16:39:51 +0300 Subject: [PATCH 267/275] update generate_imports script for aggregational tasks --- mteb/tasks/__init__.py | 48 ++++++++++++------------- mteb/tasks/aggregated_tasks/__init__.py | 12 ++----- scripts/generate_imports.py | 10 ++++++ 3 files changed, 37 insertions(+), 33 deletions(-) diff --git a/mteb/tasks/__init__.py b/mteb/tasks/__init__.py index 49bc4dc53c..1ee0cd205b 100644 --- a/mteb/tasks/__init__.py +++ b/mteb/tasks/__init__.py @@ -1251,32 +1251,8 @@ "SentimentAnalysisHindi", "MalayalamNewsClassification", "InstructIR", - "CQADupstackAndroidRetrieval", - "CQADupstackEnglishRetrieval", - "CQADupstackGamingRetrieval", - "CQADupstackGisRetrieval", - "CQADupstackMathematicaRetrieval", - "CQADupstackPhysicsRetrieval", - "CQADupstackProgrammersRetrieval", "CQADupstackRetrieval", - "CQADupstackStatsRetrieval", - "CQADupstackTexRetrieval", - "CQADupstackUnixRetrieval", - 
"CQADupstackWebmastersRetrieval", - "CQADupstackWordpressRetrieval", - "CQADupstackAndroidRetrievalFa", - "CQADupstackEnglishRetrievalFa", - "CQADupstackGamingRetrievalFa", - "CQADupstackGisRetrievalFa", - "CQADupstackMathematicaRetrievalFa", - "CQADupstackPhysicsRetrievalFa", - "CQADupstackProgrammersRetrievalFa", "CQADupstackRetrievalFa", - "CQADupstackStatsRetrievalFa", - "CQADupstackTexRetrievalFa", - "CQADupstackUnixRetrievalFa", - "CQADupstackWebmastersRetrievalFa", - "CQADupstackWordpressRetrievalFa", "SynPerChatbotConvSAClassification", "CmedqaRetrieval", "CovidRetrieval", @@ -1340,6 +1316,7 @@ "DBPediaHardNegatives", "NarrativeQARetrieval", "MSMARCOv2", + "CQADupstackTexRetrieval", "TRECCOVID", "WinoGrande", "QuoraRetrieval", @@ -1347,6 +1324,7 @@ "AlphaNLI", "LEMBNeedleRetrieval", "LEMBPasskeyRetrieval", + "CQADupstackAndroidRetrieval", "TempReasonL2Context", "NanoDBPediaRetrieval", "ARCChallenge", @@ -1362,12 +1340,19 @@ "TopiOCQARetrievalHardNegatives", "ClimateFEVER", "ClimateFEVERHardNegatives", + "CQADupstackWordpressRetrieval", + "CQADupstackEnglishRetrieval", "NanoTouche2020Retrieval", + "CQADupstackStatsRetrieval", "MLQuestionsRetrieval", "TempReasonL2Fact", "NanoSciFactRetrieval", + "CQADupstackGamingRetrieval", + "CQADupstackWebmastersRetrieval", "NanoFiQA2018Retrieval", + "CQADupstackUnixRetrieval", "TempReasonL3Pure", + "CQADupstackPhysicsRetrieval", "FiQA2018", "LitSearchRetrieval", "NanoFEVERRetrieval", @@ -1377,6 +1362,7 @@ "NanoNFCorpusRetrieval", "FaithDialRetrieval", "SciFact", + "CQADupstackMathematicaRetrieval", "RARbMath", "NanoNQRetrieval", "HellaSwag", @@ -1396,18 +1382,32 @@ "TempReasonL3Fact", "FEVER", "FEVERHardNegatives", + "CQADupstackGisRetrieval", "NanoSCIDOCSRetrieval", "AILACasedocs", "NFCorpus", "ChemNQRetrieval", "LEMBSummScreenFDRetrieval", "LEMBNarrativeQARetrieval", + "CQADupstackProgrammersRetrieval", "PersianWebDocumentRetrieval", "SynPerChatbotRAGFAQRetrieval", "SynPerChatbotRAGTopicsRetrieval", 
"SynPerChatbotTopicsRetrieval", "SynPerQARetrieval", "ArguAnaFa", + "CQADupstackAndroidRetrievalFa", + "CQADupstackEnglishRetrievalFa", + "CQADupstackGamingRetrievalFa", + "CQADupstackGisRetrievalFa", + "CQADupstackMathematicaRetrievalFa", + "CQADupstackPhysicsRetrievalFa", + "CQADupstackProgrammersRetrievalFa", + "CQADupstackStatsRetrievalFa", + "CQADupstackTexRetrievalFa", + "CQADupstackUnixRetrievalFa", + "CQADupstackWebmastersRetrievalFa", + "CQADupstackWordpressRetrievalFa", "ClimateFEVERFa", "DBPediaFa", "FiQA2018Fa", diff --git a/mteb/tasks/aggregated_tasks/__init__.py b/mteb/tasks/aggregated_tasks/__init__.py index c363375144..5333db7916 100644 --- a/mteb/tasks/aggregated_tasks/__init__.py +++ b/mteb/tasks/aggregated_tasks/__init__.py @@ -1,14 +1,8 @@ from __future__ import annotations -from .CQADupStackRetrieval import ( - CQADupstackRetrieval, -) -from .CQADupStackRetrievalFa import ( - CQADupstackRetrievalFa, -) -from .SynPerChatbotConvSAClassification import ( - SynPerChatbotConvSAClassification, -) +from .CQADupStackRetrieval import CQADupstackRetrieval +from .CQADupStackRetrievalFa import CQADupstackRetrievalFa +from .SynPerChatbotConvSAClassification import SynPerChatbotConvSAClassification __all__ = [ "CQADupstackRetrieval", diff --git a/scripts/generate_imports.py b/scripts/generate_imports.py index ae331de9df..4961991034 100644 --- a/scripts/generate_imports.py +++ b/scripts/generate_imports.py @@ -8,6 +8,7 @@ from pathlib import Path from mteb.abstasks import AbsTask +from mteb.abstasks.aggregated_task import AbsTaskAggregate BASE_DIR = Path("../mteb/tasks") @@ -19,6 +20,8 @@ def find_task_classes_in_module(full_module_name): except ImportError: return [] + is_module_aggregated = "aggregated_tasks" in full_module_name + task_classes = [] for name, obj in inspect.getmembers(mod, inspect.isclass): if ( @@ -29,6 +32,13 @@ def find_task_classes_in_module(full_module_name): and not obj.__name__.startswith("AbsTask") and not obj.__name__ == 
"MultilingualTask" ): + if is_module_aggregated: + print( + "aggregated_tasks", obj.__name__, issubclass(obj, AbsTaskAggregate) + ) + if issubclass(obj, AbsTaskAggregate): + task_classes.append(name) + continue task_classes.append(name) return task_classes From 728757bcdd92824229d2acf68ed636d30628a223 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 2 Feb 2025 17:24:43 +0300 Subject: [PATCH 268/275] add descriptive stats --- .../PubChemSMILESBitextMining.json | 69 +++++ .../BitextMining/SAMSumFa.json | 15 + .../SynPerChatbotRAGSumSRetrieval.json | 15 + .../SynPerChatbotSumSRetrieval.json | 15 + .../Classification/DeepSentiPers.json | 50 +++ .../DigikalamagClassification.json | 140 +++++++++ .../NLPTwitterAnalysisClassification.json | 170 ++++++++++ .../Classification/PersianTextEmotion.json | 68 ++++ .../Classification/PersianTextTone.json | 56 ++++ .../SDSEyeProtectionClassification.json | 44 +++ .../SDSGlovesClassification.json | 44 +++ .../Classification/SIDClassification.json | 80 +++++ .../Classification/SentimentDKSF.json | 50 +++ .../SynPerChatbotConvSAAnger.json | 44 +++ .../SynPerChatbotConvSAFear.json | 44 +++ .../SynPerChatbotConvSAFriendship.json | 44 +++ .../SynPerChatbotConvSAHappiness.json | 44 +++ .../SynPerChatbotConvSAJealousy.json | 44 +++ .../SynPerChatbotConvSALove.json | 44 +++ .../SynPerChatbotConvSASadness.json | 44 +++ .../SynPerChatbotConvSASatisfaction.json | 44 +++ .../SynPerChatbotConvSASurprise.json | 44 +++ ...hatbotConvSAToneChatbotClassification.json | 50 +++ ...erChatbotConvSAToneUserClassification.json | 50 +++ ...erChatbotRAGToneChatbotClassification.json | 62 ++++ ...ynPerChatbotRAGToneUserClassification.json | 62 ++++ ...hatbotSatisfactionLevelClassification.json | 62 ++++ ...ynPerChatbotToneChatbotClassification.json | 62 ++++ .../SynPerChatbotToneUserClassification.json | 62 ++++ .../WikipediaBioMetChemClassification.json | 44 +++ 
...ikipediaBiolumNeurochemClassification.json | 44 +++ ...pediaChemEngSpecialtiesClassification.json | 56 ++++ .../WikipediaChemFieldsClassification.json | 92 ++++++ ...ikipediaChemistryTopicsClassification.json | 86 ++++++ ...diaCompChemSpectroscopyClassification.json | 44 +++ ...iaCryobiologySeparationClassification.json | 62 ++++ ...ystallographyAnalyticalClassification.json | 44 +++ ...iaGreenhouseEnantiopureClassification.json | 44 +++ ...ikipediaIsotopesFissionClassification.json | 44 +++ .../WikipediaLuminescenceClassification.json | 44 +++ ...kipediaOrganicInorganicClassification.json | 44 +++ ...ediaSaltsSemiconductorsClassification.json | 44 +++ ...ediaSolidStateColloidalClassification.json | 44 +++ ...pediaTheoreticalAppliedClassification.json | 44 +++ .../Clustering/BeytooteClustering.json | 73 +++++ .../Clustering/DigikalamagClustering.json | 37 +++ .../Clustering/HamshahriClustring.json | 157 ++++++++++ .../NLPTwitterAnalysisClustering.json | 94 ++++++ .../Clustering/SIDClustring.json | 40 +++ .../WikipediaChemistryTopicsClustering.json | 43 +++ ...pediaSpecialtiesInChemistryClustering.json | 28 ++ .../PairClassification/CExaPPC.json | 24 ++ .../FarsiParaphraseDetection.json | 24 ++ .../PairClassification/ParsinluEntail.json | 24 ++ .../ParsinluQueryParaphPC.json | 24 ++ .../PubChemAISentenceParaphrasePC.json | 24 ++ .../PairClassification/PubChemSMILESPC.json | 24 ++ .../PairClassification/PubChemSynonymPC.json | 24 ++ .../PubChemWikiPairClassification.json | 290 ++++++++++++++++++ .../PubChemWikiParagraphsPC.json | 24 ++ .../SynPerChatbotRAGFAQPC.json | 24 ++ .../PairClassification/SynPerQAPC.json | 24 ++ .../SynPerTextKeywordsPC.json | 24 ++ .../CodeRAGLibraryDocumentationSolutions.json | 31 ++ .../Reranking/CodeRAGOnlineTutorials.json | 31 ++ .../CodeRAGProgrammingSolutions.json | 31 ++ .../Retrieval/ArguAna-Fa.json | 31 ++ .../CQADupstackAndroidRetrieval-Fa.json | 31 ++ .../CQADupstackEnglishRetrieval-Fa.json | 31 ++ 
.../CQADupstackGamingRetrieval-Fa.json | 31 ++ .../Retrieval/CQADupstackGisRetrieval-Fa.json | 31 ++ .../CQADupstackMathematicaRetrieval-Fa.json | 31 ++ .../CQADupstackPhysicsRetrieval-Fa.json | 31 ++ .../CQADupstackProgrammersRetrieval-Fa.json | 31 ++ .../CQADupstackStatsRetrieval-Fa.json | 31 ++ .../Retrieval/CQADupstackTexRetrieval-Fa.json | 31 ++ .../CQADupstackUnixRetrieval-Fa.json | 31 ++ .../CQADupstackWebmastersRetrieval-Fa.json | 31 ++ .../CQADupstackWordpressRetrieval-Fa.json | 31 ++ .../Retrieval/ChemHotpotQARetrieval.json | 89 ++++++ .../Retrieval/ChemNQRetrieval.json | 31 ++ .../Retrieval/FiQA2018-Fa.json | 31 ++ .../Retrieval/NFCorpus-Fa.json | 31 ++ .../PersianWebDocumentRetrieval.json | 31 ++ .../Retrieval/QuoraRetrieval-Fa.json | 31 ++ .../Retrieval/SCIDOCS-Fa.json | 31 ++ .../Retrieval/SciFact-Fa.json | 31 ++ .../SynPerChatbotRAGFAQRetrieval.json | 31 ++ .../SynPerChatbotRAGTopicsRetrieval.json | 31 ++ .../SynPerChatbotTopicsRetrieval.json | 31 ++ .../Retrieval/SynPerQARetrieval.json | 31 ++ .../Retrieval/TRECCOVID-Fa.json | 31 ++ mteb/descriptive_stats/STS/Farsick.json | 18 ++ mteb/descriptive_stats/STS/Query2Query.json | 18 ++ mteb/descriptive_stats/STS/SynPerSTS.json | 18 ++ .../PairClassification/eng/PubChemSMILESPC.py | 2 +- 96 files changed, 4440 insertions(+), 1 deletion(-) create mode 100644 mteb/descriptive_stats/BitextMining/PubChemSMILESBitextMining.json create mode 100644 mteb/descriptive_stats/BitextMining/SAMSumFa.json create mode 100644 mteb/descriptive_stats/BitextMining/SynPerChatbotRAGSumSRetrieval.json create mode 100644 mteb/descriptive_stats/BitextMining/SynPerChatbotSumSRetrieval.json create mode 100644 mteb/descriptive_stats/Classification/DeepSentiPers.json create mode 100644 mteb/descriptive_stats/Classification/DigikalamagClassification.json create mode 100644 mteb/descriptive_stats/Classification/NLPTwitterAnalysisClassification.json create mode 100644 mteb/descriptive_stats/Classification/PersianTextEmotion.json create 
mode 100644 mteb/descriptive_stats/Classification/PersianTextTone.json create mode 100644 mteb/descriptive_stats/Classification/SDSEyeProtectionClassification.json create mode 100644 mteb/descriptive_stats/Classification/SDSGlovesClassification.json create mode 100644 mteb/descriptive_stats/Classification/SIDClassification.json create mode 100644 mteb/descriptive_stats/Classification/SentimentDKSF.json create mode 100644 mteb/descriptive_stats/Classification/SynPerChatbotConvSAAnger.json create mode 100644 mteb/descriptive_stats/Classification/SynPerChatbotConvSAFear.json create mode 100644 mteb/descriptive_stats/Classification/SynPerChatbotConvSAFriendship.json create mode 100644 mteb/descriptive_stats/Classification/SynPerChatbotConvSAHappiness.json create mode 100644 mteb/descriptive_stats/Classification/SynPerChatbotConvSAJealousy.json create mode 100644 mteb/descriptive_stats/Classification/SynPerChatbotConvSALove.json create mode 100644 mteb/descriptive_stats/Classification/SynPerChatbotConvSASadness.json create mode 100644 mteb/descriptive_stats/Classification/SynPerChatbotConvSASatisfaction.json create mode 100644 mteb/descriptive_stats/Classification/SynPerChatbotConvSASurprise.json create mode 100644 mteb/descriptive_stats/Classification/SynPerChatbotConvSAToneChatbotClassification.json create mode 100644 mteb/descriptive_stats/Classification/SynPerChatbotConvSAToneUserClassification.json create mode 100644 mteb/descriptive_stats/Classification/SynPerChatbotRAGToneChatbotClassification.json create mode 100644 mteb/descriptive_stats/Classification/SynPerChatbotRAGToneUserClassification.json create mode 100644 mteb/descriptive_stats/Classification/SynPerChatbotSatisfactionLevelClassification.json create mode 100644 mteb/descriptive_stats/Classification/SynPerChatbotToneChatbotClassification.json create mode 100644 mteb/descriptive_stats/Classification/SynPerChatbotToneUserClassification.json create mode 100644 
mteb/descriptive_stats/Classification/WikipediaBioMetChemClassification.json create mode 100644 mteb/descriptive_stats/Classification/WikipediaBiolumNeurochemClassification.json create mode 100644 mteb/descriptive_stats/Classification/WikipediaChemEngSpecialtiesClassification.json create mode 100644 mteb/descriptive_stats/Classification/WikipediaChemFieldsClassification.json create mode 100644 mteb/descriptive_stats/Classification/WikipediaChemistryTopicsClassification.json create mode 100644 mteb/descriptive_stats/Classification/WikipediaCompChemSpectroscopyClassification.json create mode 100644 mteb/descriptive_stats/Classification/WikipediaCryobiologySeparationClassification.json create mode 100644 mteb/descriptive_stats/Classification/WikipediaCrystallographyAnalyticalClassification.json create mode 100644 mteb/descriptive_stats/Classification/WikipediaGreenhouseEnantiopureClassification.json create mode 100644 mteb/descriptive_stats/Classification/WikipediaIsotopesFissionClassification.json create mode 100644 mteb/descriptive_stats/Classification/WikipediaLuminescenceClassification.json create mode 100644 mteb/descriptive_stats/Classification/WikipediaOrganicInorganicClassification.json create mode 100644 mteb/descriptive_stats/Classification/WikipediaSaltsSemiconductorsClassification.json create mode 100644 mteb/descriptive_stats/Classification/WikipediaSolidStateColloidalClassification.json create mode 100644 mteb/descriptive_stats/Classification/WikipediaTheoreticalAppliedClassification.json create mode 100644 mteb/descriptive_stats/Clustering/BeytooteClustering.json create mode 100644 mteb/descriptive_stats/Clustering/DigikalamagClustering.json create mode 100644 mteb/descriptive_stats/Clustering/HamshahriClustring.json create mode 100644 mteb/descriptive_stats/Clustering/NLPTwitterAnalysisClustering.json create mode 100644 mteb/descriptive_stats/Clustering/SIDClustring.json create mode 100644 
mteb/descriptive_stats/Clustering/WikipediaChemistryTopicsClustering.json create mode 100644 mteb/descriptive_stats/Clustering/WikipediaSpecialtiesInChemistryClustering.json create mode 100644 mteb/descriptive_stats/PairClassification/CExaPPC.json create mode 100644 mteb/descriptive_stats/PairClassification/FarsiParaphraseDetection.json create mode 100644 mteb/descriptive_stats/PairClassification/ParsinluEntail.json create mode 100644 mteb/descriptive_stats/PairClassification/ParsinluQueryParaphPC.json create mode 100644 mteb/descriptive_stats/PairClassification/PubChemAISentenceParaphrasePC.json create mode 100644 mteb/descriptive_stats/PairClassification/PubChemSMILESPC.json create mode 100644 mteb/descriptive_stats/PairClassification/PubChemSynonymPC.json create mode 100644 mteb/descriptive_stats/PairClassification/PubChemWikiPairClassification.json create mode 100644 mteb/descriptive_stats/PairClassification/PubChemWikiParagraphsPC.json create mode 100644 mteb/descriptive_stats/PairClassification/SynPerChatbotRAGFAQPC.json create mode 100644 mteb/descriptive_stats/PairClassification/SynPerQAPC.json create mode 100644 mteb/descriptive_stats/PairClassification/SynPerTextKeywordsPC.json create mode 100644 mteb/descriptive_stats/Reranking/CodeRAGLibraryDocumentationSolutions.json create mode 100644 mteb/descriptive_stats/Reranking/CodeRAGOnlineTutorials.json create mode 100644 mteb/descriptive_stats/Reranking/CodeRAGProgrammingSolutions.json create mode 100644 mteb/descriptive_stats/Retrieval/ArguAna-Fa.json create mode 100644 mteb/descriptive_stats/Retrieval/CQADupstackAndroidRetrieval-Fa.json create mode 100644 mteb/descriptive_stats/Retrieval/CQADupstackEnglishRetrieval-Fa.json create mode 100644 mteb/descriptive_stats/Retrieval/CQADupstackGamingRetrieval-Fa.json create mode 100644 mteb/descriptive_stats/Retrieval/CQADupstackGisRetrieval-Fa.json create mode 100644 mteb/descriptive_stats/Retrieval/CQADupstackMathematicaRetrieval-Fa.json create mode 100644 
mteb/descriptive_stats/Retrieval/CQADupstackPhysicsRetrieval-Fa.json create mode 100644 mteb/descriptive_stats/Retrieval/CQADupstackProgrammersRetrieval-Fa.json create mode 100644 mteb/descriptive_stats/Retrieval/CQADupstackStatsRetrieval-Fa.json create mode 100644 mteb/descriptive_stats/Retrieval/CQADupstackTexRetrieval-Fa.json create mode 100644 mteb/descriptive_stats/Retrieval/CQADupstackUnixRetrieval-Fa.json create mode 100644 mteb/descriptive_stats/Retrieval/CQADupstackWebmastersRetrieval-Fa.json create mode 100644 mteb/descriptive_stats/Retrieval/CQADupstackWordpressRetrieval-Fa.json create mode 100644 mteb/descriptive_stats/Retrieval/ChemHotpotQARetrieval.json create mode 100644 mteb/descriptive_stats/Retrieval/ChemNQRetrieval.json create mode 100644 mteb/descriptive_stats/Retrieval/FiQA2018-Fa.json create mode 100644 mteb/descriptive_stats/Retrieval/NFCorpus-Fa.json create mode 100644 mteb/descriptive_stats/Retrieval/PersianWebDocumentRetrieval.json create mode 100644 mteb/descriptive_stats/Retrieval/QuoraRetrieval-Fa.json create mode 100644 mteb/descriptive_stats/Retrieval/SCIDOCS-Fa.json create mode 100644 mteb/descriptive_stats/Retrieval/SciFact-Fa.json create mode 100644 mteb/descriptive_stats/Retrieval/SynPerChatbotRAGFAQRetrieval.json create mode 100644 mteb/descriptive_stats/Retrieval/SynPerChatbotRAGTopicsRetrieval.json create mode 100644 mteb/descriptive_stats/Retrieval/SynPerChatbotTopicsRetrieval.json create mode 100644 mteb/descriptive_stats/Retrieval/SynPerQARetrieval.json create mode 100644 mteb/descriptive_stats/Retrieval/TRECCOVID-Fa.json create mode 100644 mteb/descriptive_stats/STS/Farsick.json create mode 100644 mteb/descriptive_stats/STS/Query2Query.json create mode 100644 mteb/descriptive_stats/STS/SynPerSTS.json diff --git a/mteb/descriptive_stats/BitextMining/PubChemSMILESBitextMining.json b/mteb/descriptive_stats/BitextMining/PubChemSMILESBitextMining.json new file mode 100644 index 0000000000..09694d8a28 --- /dev/null +++ 
b/mteb/descriptive_stats/BitextMining/PubChemSMILESBitextMining.json @@ -0,0 +1,69 @@ +{ + "test": { + "num_samples": 45054, + "number_of_characters": 6143970, + "unique_pairs": 45054, + "min_sentence1_length": 4, + "average_sentence1_length": 91.97886980068363, + "max_sentence1_length": 1129, + "unique_sentence1": 45053, + "min_sentence2_length": 1, + "average_sentence2_length": 44.39015403737737, + "max_sentence2_length": 503, + "unique_sentence2": 22527, + "hf_subset_descriptive_stats": { + "iso-title": { + "num_samples": 7070, + "number_of_characters": 679453, + "unique_pairs": 7070, + "min_sentence1_length": 4, + "average_sentence1_length": 25.293493635077795, + "max_sentence1_length": 332, + "unique_sentence1": 7070, + "min_sentence2_length": 5, + "average_sentence2_length": 70.81018387553041, + "max_sentence2_length": 503, + "unique_sentence2": 7070 + }, + "iso-desc": { + "num_samples": 7070, + "number_of_characters": 1729810, + "unique_pairs": 7070, + "min_sentence1_length": 16, + "average_sentence1_length": 173.85884016973125, + "max_sentence1_length": 1129, + "unique_sentence1": 7070, + "min_sentence2_length": 5, + "average_sentence2_length": 70.81018387553041, + "max_sentence2_length": 503, + "unique_sentence2": 7070 + }, + "canon-title": { + "num_samples": 15457, + "number_of_characters": 857833, + "unique_pairs": 15457, + "min_sentence1_length": 4, + "average_sentence1_length": 23.192340040111276, + "max_sentence1_length": 320, + "unique_sentence1": 15457, + "min_sentence2_length": 1, + "average_sentence2_length": 32.30568674387009, + "max_sentence2_length": 253, + "unique_sentence2": 15457 + }, + "canon-desc": { + "num_samples": 15457, + "number_of_characters": 2876874, + "unique_pairs": 15457, + "min_sentence1_length": 17, + "average_sentence1_length": 153.81542343274893, + "max_sentence1_length": 1114, + "unique_sentence1": 15457, + "min_sentence2_length": 1, + "average_sentence2_length": 32.30568674387009, + "max_sentence2_length": 253, + 
"unique_sentence2": 15457 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/BitextMining/SAMSumFa.json b/mteb/descriptive_stats/BitextMining/SAMSumFa.json new file mode 100644 index 0000000000..ee1407a697 --- /dev/null +++ b/mteb/descriptive_stats/BitextMining/SAMSumFa.json @@ -0,0 +1,15 @@ +{ + "test": { + "num_samples": 1561, + "number_of_characters": 930587, + "unique_pairs": 1561, + "min_sentence1_length": 46, + "average_sentence1_length": 489.58872517616913, + "max_sentence1_length": 2802, + "unique_sentence1": 1561, + "min_sentence2_length": 14, + "average_sentence2_length": 106.55925688661115, + "max_sentence2_length": 325, + "unique_sentence2": 1561 + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/BitextMining/SynPerChatbotRAGSumSRetrieval.json b/mteb/descriptive_stats/BitextMining/SynPerChatbotRAGSumSRetrieval.json new file mode 100644 index 0000000000..002827820d --- /dev/null +++ b/mteb/descriptive_stats/BitextMining/SynPerChatbotRAGSumSRetrieval.json @@ -0,0 +1,15 @@ +{ + "test": { + "num_samples": 1087, + "number_of_characters": 835245, + "unique_pairs": 1087, + "min_sentence1_length": 37, + "average_sentence1_length": 628.5234590616375, + "max_sentence1_length": 2601, + "unique_sentence1": 1087, + "min_sentence2_length": 43, + "average_sentence2_length": 139.87120515179393, + "max_sentence2_length": 284, + "unique_sentence2": 1087 + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/BitextMining/SynPerChatbotSumSRetrieval.json b/mteb/descriptive_stats/BitextMining/SynPerChatbotSumSRetrieval.json new file mode 100644 index 0000000000..4f13145796 --- /dev/null +++ b/mteb/descriptive_stats/BitextMining/SynPerChatbotSumSRetrieval.json @@ -0,0 +1,15 @@ +{ + "test": { + "num_samples": 1537, + "number_of_characters": 1701259, + "unique_pairs": 1537, + "min_sentence1_length": 383, + "average_sentence1_length": 949.729342875732, + "max_sentence1_length": 1828, + "unique_sentence1": 1537, + 
"min_sentence2_length": 68, + "average_sentence2_length": 157.1405335068315, + "max_sentence2_length": 308, + "unique_sentence2": 1537 + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/DeepSentiPers.json b/mteb/descriptive_stats/Classification/DeepSentiPers.json new file mode 100644 index 0000000000..2dcb17d55e --- /dev/null +++ b/mteb/descriptive_stats/Classification/DeepSentiPers.json @@ -0,0 +1,50 @@ +{ + "test": { + "num_samples": 1854, + "number_of_characters": 201913, + "number_texts_intersect_with_train": 168, + "min_text_length": 5, + "average_text_length": 108.9066882416397, + "max_text_length": 1531, + "unique_texts": 1835, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 3, + "labels": { + "0": { + "count": 915 + }, + "1": { + "count": 196 + }, + "2": { + "count": 743 + } + } + }, + "train": { + "num_samples": 6320, + "number_of_characters": 641376, + "number_texts_intersect_with_train": null, + "min_text_length": 7, + "average_text_length": 101.48354430379747, + "max_text_length": 1290, + "unique_texts": 5780, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 3, + "labels": { + "2": { + "count": 1800 + }, + "1": { + "count": 985 + }, + "0": { + "count": 3535 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/DigikalamagClassification.json b/mteb/descriptive_stats/Classification/DigikalamagClassification.json new file mode 100644 index 0000000000..1b825f458a --- /dev/null +++ b/mteb/descriptive_stats/Classification/DigikalamagClassification.json @@ -0,0 +1,140 @@ +{ + "test": { + "num_samples": 852, + "number_of_characters": 2935825, + "number_texts_intersect_with_train": 0, + "min_text_length": 40, + "average_text_length": 3445.8039906103286, + "max_text_length": 36911, + "unique_texts": 852, + "min_labels_per_text": 5, + "average_label_per_text": 
12.762910798122066, + "max_labels_per_text": 14, + "unique_labels": 18, + "labels": { + "\u0633": { + "count": 328 + }, + "\u0644": { + "count": 715 + }, + "\u0627": { + "count": 787 + }, + "\u0645": { + "count": 642 + }, + "\u062a": { + "count": 488 + }, + " ": { + "count": 1470 + }, + "\u0648": { + "count": 1590 + }, + "\u0632": { + "count": 358 + }, + "\u06cc": { + "count": 1975 + }, + "\u0628": { + "count": 408 + }, + "\u0639": { + "count": 289 + }, + "\u06a9": { + "count": 302 + }, + "\u0646": { + "count": 624 + }, + "\u0698": { + "count": 277 + }, + "\u0647": { + "count": 180 + }, + "\u0631": { + "count": 193 + }, + "\u062f": { + "count": 235 + }, + "\u062e": { + "count": 13 + } + } + }, + "train": { + "num_samples": 6896, + "number_of_characters": 23218475, + "number_texts_intersect_with_train": null, + "min_text_length": 114, + "average_text_length": 3366.948230858469, + "max_text_length": 53321, + "unique_texts": 6896, + "min_labels_per_text": 5, + "average_label_per_text": 12.764936194895592, + "max_labels_per_text": 14, + "unique_labels": 18, + "labels": { + "\u0639": { + "count": 2342 + }, + "\u0644": { + "count": 5794 + }, + "\u0645": { + "count": 5194 + }, + " ": { + "count": 11904 + }, + "\u0648": { + "count": 12878 + }, + "\u062a": { + "count": 3961 + }, + "\u06a9": { + "count": 2451 + }, + "\u0646": { + "count": 5046 + }, + "\u0698": { + "count": 2245 + }, + "\u06cc": { + "count": 15977 + }, + "\u0647": { + "count": 1451 + }, + "\u0631": { + "count": 1552 + }, + "\u0633": { + "count": 2654 + }, + "\u0627": { + "count": 6371 + }, + "\u0628": { + "count": 3309 + }, + "\u0632": { + "count": 2897 + }, + "\u062f": { + "count": 1900 + }, + "\u062e": { + "count": 101 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/NLPTwitterAnalysisClassification.json b/mteb/descriptive_stats/Classification/NLPTwitterAnalysisClassification.json new file mode 100644 index 0000000000..b4349d1d10 --- /dev/null +++ 
b/mteb/descriptive_stats/Classification/NLPTwitterAnalysisClassification.json @@ -0,0 +1,170 @@ +{ + "test": { + "num_samples": 1360, + "number_of_characters": 171909, + "number_texts_intersect_with_train": 2, + "min_text_length": 2, + "average_text_length": 126.40367647058824, + "max_text_length": 4523, + "unique_texts": 1359, + "min_labels_per_text": 5, + "average_label_per_text": 19.13455882352941, + "max_labels_per_text": 30, + "unique_labels": 23, + "labels": { + "w": { + "count": 100 + }, + "e": { + "count": 2014 + }, + "a": { + "count": 2827 + }, + "t": { + "count": 1860 + }, + "h": { + "count": 501 + }, + "r": { + "count": 1155 + }, + "_": { + "count": 2473 + }, + "n": { + "count": 2971 + }, + "d": { + "count": 1463 + }, + "s": { + "count": 1675 + }, + "o": { + "count": 1560 + }, + "l": { + "count": 959 + }, + "u": { + "count": 759 + }, + "m": { + "count": 555 + }, + "i": { + "count": 2078 + }, + "v": { + "count": 405 + }, + "f": { + "count": 350 + }, + "y": { + "count": 400 + }, + "b": { + "count": 200 + }, + "c": { + "count": 764 + }, + "p": { + "count": 400 + }, + "g": { + "count": 454 + }, + "k": { + "count": 100 + } + } + }, + "train": { + "num_samples": 2715, + "number_of_characters": 343076, + "number_texts_intersect_with_train": null, + "min_text_length": 2, + "average_text_length": 126.36316758747698, + "max_text_length": 7000, + "unique_texts": 2712, + "min_labels_per_text": 5, + "average_label_per_text": 19.140331491712708, + "max_labels_per_text": 30, + "unique_labels": 23, + "labels": { + "t": { + "count": 3716 + }, + "r": { + "count": 2307 + }, + "a": { + "count": 5647 + }, + "v": { + "count": 808 + }, + "e": { + "count": 4024 + }, + "l": { + "count": 1916 + }, + "_": { + "count": 4938 + }, + "n": { + "count": 5937 + }, + "d": { + "count": 2923 + }, + "u": { + "count": 1513 + }, + "b": { + "count": 400 + }, + "o": { + "count": 3114 + }, + "k": { + "count": 200 + }, + "s": { + "count": 3339 + }, + "i": { + "count": 4148 + }, + "p": { + "count": 
800 + }, + "m": { + "count": 1107 + }, + "f": { + "count": 699 + }, + "y": { + "count": 799 + }, + "g": { + "count": 908 + }, + "c": { + "count": 1524 + }, + "h": { + "count": 999 + }, + "w": { + "count": 200 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/PersianTextEmotion.json b/mteb/descriptive_stats/Classification/PersianTextEmotion.json new file mode 100644 index 0000000000..8e77e2d6d5 --- /dev/null +++ b/mteb/descriptive_stats/Classification/PersianTextEmotion.json @@ -0,0 +1,68 @@ +{ + "test": { + "num_samples": 1390, + "number_of_characters": 220225, + "number_texts_intersect_with_train": 5, + "min_text_length": 11, + "average_text_length": 158.43525179856115, + "max_text_length": 350, + "unique_texts": 1389, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 6, + "labels": { + "2": { + "count": 232 + }, + "1": { + "count": 232 + }, + "5": { + "count": 232 + }, + "4": { + "count": 231 + }, + "3": { + "count": 232 + }, + "0": { + "count": 231 + } + } + }, + "train": { + "num_samples": 5558, + "number_of_characters": 867555, + "number_texts_intersect_with_train": null, + "min_text_length": 2, + "average_text_length": 156.09121986326016, + "max_text_length": 1265, + "unique_texts": 5546, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 6, + "labels": { + "4": { + "count": 927 + }, + "0": { + "count": 927 + }, + "1": { + "count": 926 + }, + "3": { + "count": 926 + }, + "2": { + "count": 926 + }, + "5": { + "count": 926 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/PersianTextTone.json b/mteb/descriptive_stats/Classification/PersianTextTone.json new file mode 100644 index 0000000000..7884c17b77 --- /dev/null +++ b/mteb/descriptive_stats/Classification/PersianTextTone.json @@ -0,0 +1,56 @@ +{ + "test": { + "num_samples": 2928, + "number_of_characters": 
648514, + "number_texts_intersect_with_train": 6, + "min_text_length": 41, + "average_text_length": 221.4870218579235, + "max_text_length": 949, + "unique_texts": 2925, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 4, + "labels": { + "3": { + "count": 1680 + }, + "2": { + "count": 374 + }, + "0": { + "count": 459 + }, + "1": { + "count": 415 + } + } + }, + "train": { + "num_samples": 16587, + "number_of_characters": 3664508, + "number_texts_intersect_with_train": null, + "min_text_length": 39, + "average_text_length": 220.92650871164165, + "max_text_length": 1079, + "unique_texts": 16576, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 4, + "labels": { + "1": { + "count": 2434 + }, + "3": { + "count": 9712 + }, + "0": { + "count": 2476 + }, + "2": { + "count": 1965 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/SDSEyeProtectionClassification.json b/mteb/descriptive_stats/Classification/SDSEyeProtectionClassification.json new file mode 100644 index 0000000000..5244e9439e --- /dev/null +++ b/mteb/descriptive_stats/Classification/SDSEyeProtectionClassification.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 2000, + "number_of_characters": 7007197, + "number_texts_intersect_with_train": 84, + "min_text_length": 886, + "average_text_length": 3503.5985, + "max_text_length": 12360, + "unique_texts": 1971, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 1995 + }, + "0": { + "count": 5 + } + } + }, + "train": { + "num_samples": 6000, + "number_of_characters": 20958793, + "number_texts_intersect_with_train": null, + "min_text_length": 789, + "average_text_length": 3493.1321666666668, + "max_text_length": 11292, + "unique_texts": 5791, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + 
"max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 5987 + }, + "0": { + "count": 13 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/SDSGlovesClassification.json b/mteb/descriptive_stats/Classification/SDSGlovesClassification.json new file mode 100644 index 0000000000..faed2c15f9 --- /dev/null +++ b/mteb/descriptive_stats/Classification/SDSGlovesClassification.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 2000, + "number_of_characters": 7114429, + "number_texts_intersect_with_train": 100, + "min_text_length": 307, + "average_text_length": 3557.2145, + "max_text_length": 10481, + "unique_texts": 1971, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 1992 + }, + "0": { + "count": 8 + } + } + }, + "train": { + "num_samples": 6000, + "number_of_characters": 21134190, + "number_texts_intersect_with_train": null, + "min_text_length": 307, + "average_text_length": 3522.365, + "max_text_length": 13127, + "unique_texts": 5791, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 5971 + }, + "0": { + "count": 29 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/SIDClassification.json b/mteb/descriptive_stats/Classification/SIDClassification.json new file mode 100644 index 0000000000..8bb0fd99cf --- /dev/null +++ b/mteb/descriptive_stats/Classification/SIDClassification.json @@ -0,0 +1,80 @@ +{ + "test": { + "num_samples": 3735, + "number_of_characters": 4852536, + "number_texts_intersect_with_train": 2, + "min_text_length": 180, + "average_text_length": 1299.2064257028112, + "max_text_length": 5950, + "unique_texts": 3735, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 8, + "labels": { + "1": { + "count": 
1441 + }, + "7": { + "count": 482 + }, + "2": { + "count": 191 + }, + "6": { + "count": 384 + }, + "5": { + "count": 645 + }, + "3": { + "count": 376 + }, + "4": { + "count": 159 + }, + "0": { + "count": 57 + } + } + }, + "train": { + "num_samples": 8712, + "number_of_characters": 11265296, + "number_texts_intersect_with_train": null, + "min_text_length": 159, + "average_text_length": 1293.0780532598715, + "max_text_length": 6245, + "unique_texts": 8712, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 8, + "labels": { + "5": { + "count": 1446 + }, + "1": { + "count": 3361 + }, + "7": { + "count": 1120 + }, + "6": { + "count": 953 + }, + "3": { + "count": 877 + }, + "0": { + "count": 163 + }, + "2": { + "count": 443 + }, + "4": { + "count": 349 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/SentimentDKSF.json b/mteb/descriptive_stats/Classification/SentimentDKSF.json new file mode 100644 index 0000000000..ec555f24ee --- /dev/null +++ b/mteb/descriptive_stats/Classification/SentimentDKSF.json @@ -0,0 +1,50 @@ +{ + "test": { + "num_samples": 2315, + "number_of_characters": 276753, + "number_texts_intersect_with_train": 636, + "min_text_length": 2, + "average_text_length": 119.54773218142549, + "max_text_length": 1998, + "unique_texts": 2298, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 3, + "labels": { + "0": { + "count": 1107 + }, + "1": { + "count": 1032 + }, + "2": { + "count": 176 + } + } + }, + "train": { + "num_samples": 28602, + "number_of_characters": 3590436, + "number_texts_intersect_with_train": null, + "min_text_length": 1, + "average_text_length": 125.53094189217538, + "max_text_length": 2867, + "unique_texts": 28299, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 3, + "labels": { + "0": { + "count": 10206 + }, + "1": { + 
"count": 9587 + }, + "2": { + "count": 8809 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/SynPerChatbotConvSAAnger.json b/mteb/descriptive_stats/Classification/SynPerChatbotConvSAAnger.json new file mode 100644 index 0000000000..38712f25d3 --- /dev/null +++ b/mteb/descriptive_stats/Classification/SynPerChatbotConvSAAnger.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 410, + "number_of_characters": 429280, + "number_texts_intersect_with_train": 0, + "min_text_length": 345, + "average_text_length": 1047.0243902439024, + "max_text_length": 1956, + "unique_texts": 410, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "0": { + "count": 134 + }, + "1": { + "count": 276 + } + } + }, + "train": { + "num_samples": 1170, + "number_of_characters": 1221656, + "number_texts_intersect_with_train": null, + "min_text_length": 389, + "average_text_length": 1044.1504273504274, + "max_text_length": 1930, + "unique_texts": 1170, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 780 + }, + "0": { + "count": 390 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/SynPerChatbotConvSAFear.json b/mteb/descriptive_stats/Classification/SynPerChatbotConvSAFear.json new file mode 100644 index 0000000000..bb3ed9442a --- /dev/null +++ b/mteb/descriptive_stats/Classification/SynPerChatbotConvSAFear.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 117, + "number_of_characters": 118712, + "number_texts_intersect_with_train": 0, + "min_text_length": 518, + "average_text_length": 1014.6324786324786, + "max_text_length": 1763, + "unique_texts": 117, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 71 + }, + "0": { + "count": 46 + } + } + 
}, + "train": { + "num_samples": 322, + "number_of_characters": 334453, + "number_texts_intersect_with_train": null, + "min_text_length": 447, + "average_text_length": 1038.6739130434783, + "max_text_length": 1803, + "unique_texts": 322, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "0": { + "count": 101 + }, + "1": { + "count": 221 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/SynPerChatbotConvSAFriendship.json b/mteb/descriptive_stats/Classification/SynPerChatbotConvSAFriendship.json new file mode 100644 index 0000000000..4218498f9e --- /dev/null +++ b/mteb/descriptive_stats/Classification/SynPerChatbotConvSAFriendship.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 138, + "number_of_characters": 131199, + "number_texts_intersect_with_train": 0, + "min_text_length": 485, + "average_text_length": 950.7173913043479, + "max_text_length": 1679, + "unique_texts": 138, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 92 + }, + "0": { + "count": 46 + } + } + }, + "train": { + "num_samples": 372, + "number_of_characters": 341700, + "number_texts_intersect_with_train": null, + "min_text_length": 369, + "average_text_length": 918.5483870967741, + "max_text_length": 1735, + "unique_texts": 372, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 248 + }, + "0": { + "count": 124 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/SynPerChatbotConvSAHappiness.json b/mteb/descriptive_stats/Classification/SynPerChatbotConvSAHappiness.json new file mode 100644 index 0000000000..36a0872792 --- /dev/null +++ b/mteb/descriptive_stats/Classification/SynPerChatbotConvSAHappiness.json @@ -0,0 +1,44 @@ +{ + "test": { + 
"num_samples": 118, + "number_of_characters": 111986, + "number_texts_intersect_with_train": 0, + "min_text_length": 502, + "average_text_length": 949.0338983050848, + "max_text_length": 1625, + "unique_texts": 118, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 80 + }, + "0": { + "count": 38 + } + } + }, + "train": { + "num_samples": 376, + "number_of_characters": 350731, + "number_texts_intersect_with_train": null, + "min_text_length": 434, + "average_text_length": 932.7952127659574, + "max_text_length": 1778, + "unique_texts": 376, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "0": { + "count": 128 + }, + "1": { + "count": 248 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/SynPerChatbotConvSAJealousy.json b/mteb/descriptive_stats/Classification/SynPerChatbotConvSAJealousy.json new file mode 100644 index 0000000000..bad22e5f12 --- /dev/null +++ b/mteb/descriptive_stats/Classification/SynPerChatbotConvSAJealousy.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 29, + "number_of_characters": 30832, + "number_texts_intersect_with_train": 0, + "min_text_length": 584, + "average_text_length": 1063.1724137931035, + "max_text_length": 1437, + "unique_texts": 29, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "0": { + "count": 11 + }, + "1": { + "count": 18 + } + } + }, + "train": { + "num_samples": 108, + "number_of_characters": 120208, + "number_texts_intersect_with_train": null, + "min_text_length": 595, + "average_text_length": 1113.037037037037, + "max_text_length": 1817, + "unique_texts": 108, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 74 + }, + "0": { + 
"count": 34 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/SynPerChatbotConvSALove.json b/mteb/descriptive_stats/Classification/SynPerChatbotConvSALove.json new file mode 100644 index 0000000000..f6834bc1bf --- /dev/null +++ b/mteb/descriptive_stats/Classification/SynPerChatbotConvSALove.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 35, + "number_of_characters": 37257, + "number_texts_intersect_with_train": 0, + "min_text_length": 545, + "average_text_length": 1064.4857142857143, + "max_text_length": 1648, + "unique_texts": 35, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "0": { + "count": 10 + }, + "1": { + "count": 25 + } + } + }, + "train": { + "num_samples": 91, + "number_of_characters": 93261, + "number_texts_intersect_with_train": null, + "min_text_length": 554, + "average_text_length": 1024.8461538461538, + "max_text_length": 1849, + "unique_texts": 91, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "0": { + "count": 34 + }, + "1": { + "count": 57 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/SynPerChatbotConvSASadness.json b/mteb/descriptive_stats/Classification/SynPerChatbotConvSASadness.json new file mode 100644 index 0000000000..2cd4bc2bb7 --- /dev/null +++ b/mteb/descriptive_stats/Classification/SynPerChatbotConvSASadness.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 102, + "number_of_characters": 107535, + "number_texts_intersect_with_train": 0, + "min_text_length": 582, + "average_text_length": 1054.264705882353, + "max_text_length": 2518, + "unique_texts": 102, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 68 + }, + "0": { + "count": 34 + } + } + }, + "train": { + "num_samples": 334, + 
"number_of_characters": 338880, + "number_texts_intersect_with_train": null, + "min_text_length": 343, + "average_text_length": 1014.6107784431138, + "max_text_length": 1901, + "unique_texts": 334, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 222 + }, + "0": { + "count": 112 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/SynPerChatbotConvSASatisfaction.json b/mteb/descriptive_stats/Classification/SynPerChatbotConvSASatisfaction.json new file mode 100644 index 0000000000..f2d5fdd521 --- /dev/null +++ b/mteb/descriptive_stats/Classification/SynPerChatbotConvSASatisfaction.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 429, + "number_of_characters": 439936, + "number_texts_intersect_with_train": 0, + "min_text_length": 414, + "average_text_length": 1025.4918414918416, + "max_text_length": 1991, + "unique_texts": 429, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "0": { + "count": 156 + }, + "1": { + "count": 273 + } + } + }, + "train": { + "num_samples": 1314, + "number_of_characters": 1333578, + "number_texts_intersect_with_train": null, + "min_text_length": 421, + "average_text_length": 1014.8995433789954, + "max_text_length": 2055, + "unique_texts": 1314, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 889 + }, + "0": { + "count": 425 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/SynPerChatbotConvSASurprise.json b/mteb/descriptive_stats/Classification/SynPerChatbotConvSASurprise.json new file mode 100644 index 0000000000..a01f9cb4de --- /dev/null +++ b/mteb/descriptive_stats/Classification/SynPerChatbotConvSASurprise.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 121, + 
"number_of_characters": 122106, + "number_texts_intersect_with_train": 0, + "min_text_length": 428, + "average_text_length": 1009.1404958677685, + "max_text_length": 1665, + "unique_texts": 121, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 76 + }, + "0": { + "count": 45 + } + } + }, + "train": { + "num_samples": 409, + "number_of_characters": 416151, + "number_texts_intersect_with_train": null, + "min_text_length": 414, + "average_text_length": 1017.4841075794621, + "max_text_length": 2012, + "unique_texts": 409, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 277 + }, + "0": { + "count": 132 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/SynPerChatbotConvSAToneChatbotClassification.json b/mteb/descriptive_stats/Classification/SynPerChatbotConvSAToneChatbotClassification.json new file mode 100644 index 0000000000..1e9f17f200 --- /dev/null +++ b/mteb/descriptive_stats/Classification/SynPerChatbotConvSAToneChatbotClassification.json @@ -0,0 +1,50 @@ +{ + "test": { + "num_samples": 1499, + "number_of_characters": 1528843, + "number_texts_intersect_with_train": 0, + "min_text_length": 345, + "average_text_length": 1019.9086057371582, + "max_text_length": 2518, + "unique_texts": 1499, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 3, + "labels": { + "0": { + "count": 947 + }, + "1": { + "count": 408 + }, + "2": { + "count": 144 + } + } + }, + "train": { + "num_samples": 4496, + "number_of_characters": 4550618, + "number_texts_intersect_with_train": null, + "min_text_length": 343, + "average_text_length": 1012.1481316725979, + "max_text_length": 2055, + "unique_texts": 4496, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + 
"unique_labels": 3, + "labels": { + "0": { + "count": 2821 + }, + "1": { + "count": 1200 + }, + "2": { + "count": 475 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/SynPerChatbotConvSAToneUserClassification.json b/mteb/descriptive_stats/Classification/SynPerChatbotConvSAToneUserClassification.json new file mode 100644 index 0000000000..32d15961b2 --- /dev/null +++ b/mteb/descriptive_stats/Classification/SynPerChatbotConvSAToneUserClassification.json @@ -0,0 +1,50 @@ +{ + "test": { + "num_samples": 1499, + "number_of_characters": 1528843, + "number_texts_intersect_with_train": 0, + "min_text_length": 345, + "average_text_length": 1019.9086057371582, + "max_text_length": 2518, + "unique_texts": 1499, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 3, + "labels": { + "2": { + "count": 954 + }, + "0": { + "count": 346 + }, + "1": { + "count": 199 + } + } + }, + "train": { + "num_samples": 4496, + "number_of_characters": 4550618, + "number_texts_intersect_with_train": null, + "min_text_length": 343, + "average_text_length": 1012.1481316725979, + "max_text_length": 2055, + "unique_texts": 4496, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 3, + "labels": { + "0": { + "count": 1093 + }, + "1": { + "count": 573 + }, + "2": { + "count": 2830 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/SynPerChatbotRAGToneChatbotClassification.json b/mteb/descriptive_stats/Classification/SynPerChatbotRAGToneChatbotClassification.json new file mode 100644 index 0000000000..0e8ee678c8 --- /dev/null +++ b/mteb/descriptive_stats/Classification/SynPerChatbotRAGToneChatbotClassification.json @@ -0,0 +1,62 @@ +{ + "test": { + "num_samples": 1087, + "number_of_characters": 683205, + "number_texts_intersect_with_train": 0, + "min_text_length": 37, + "average_text_length": 
628.5234590616375, + "max_text_length": 2601, + "unique_texts": 1087, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 5, + "labels": { + "4": { + "count": 328 + }, + "3": { + "count": 503 + }, + "0": { + "count": 117 + }, + "2": { + "count": 64 + }, + "1": { + "count": 75 + } + } + }, + "train": { + "num_samples": 3261, + "number_of_characters": 2056861, + "number_texts_intersect_with_train": null, + "min_text_length": 38, + "average_text_length": 630.7454768475927, + "max_text_length": 2813, + "unique_texts": 3261, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 5, + "labels": { + "4": { + "count": 1042 + }, + "3": { + "count": 1475 + }, + "1": { + "count": 183 + }, + "2": { + "count": 192 + }, + "0": { + "count": 369 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/SynPerChatbotRAGToneUserClassification.json b/mteb/descriptive_stats/Classification/SynPerChatbotRAGToneUserClassification.json new file mode 100644 index 0000000000..b919603bf0 --- /dev/null +++ b/mteb/descriptive_stats/Classification/SynPerChatbotRAGToneUserClassification.json @@ -0,0 +1,62 @@ +{ + "test": { + "num_samples": 1087, + "number_of_characters": 683205, + "number_texts_intersect_with_train": 0, + "min_text_length": 37, + "average_text_length": 628.5234590616375, + "max_text_length": 2601, + "unique_texts": 1087, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 5, + "labels": { + "2": { + "count": 472 + }, + "4": { + "count": 276 + }, + "1": { + "count": 132 + }, + "3": { + "count": 57 + }, + "0": { + "count": 150 + } + } + }, + "train": { + "num_samples": 3261, + "number_of_characters": 2056861, + "number_texts_intersect_with_train": null, + "min_text_length": 38, + "average_text_length": 630.7454768475927, + "max_text_length": 2813, + "unique_texts": 3261, + 
"min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 5, + "labels": { + "0": { + "count": 464 + }, + "2": { + "count": 1428 + }, + "4": { + "count": 796 + }, + "1": { + "count": 389 + }, + "3": { + "count": 184 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/SynPerChatbotSatisfactionLevelClassification.json b/mteb/descriptive_stats/Classification/SynPerChatbotSatisfactionLevelClassification.json new file mode 100644 index 0000000000..004d66d5d4 --- /dev/null +++ b/mteb/descriptive_stats/Classification/SynPerChatbotSatisfactionLevelClassification.json @@ -0,0 +1,62 @@ +{ + "test": { + "num_samples": 1537, + "number_of_characters": 1459734, + "number_texts_intersect_with_train": 0, + "min_text_length": 383, + "average_text_length": 949.729342875732, + "max_text_length": 1828, + "unique_texts": 1537, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 5, + "labels": { + "2": { + "count": 249 + }, + "4": { + "count": 491 + }, + "0": { + "count": 254 + }, + "3": { + "count": 291 + }, + "1": { + "count": 252 + } + } + }, + "train": { + "num_samples": 8709, + "number_of_characters": 8156459, + "number_texts_intersect_with_train": null, + "min_text_length": 358, + "average_text_length": 936.5551728097371, + "max_text_length": 2013, + "unique_texts": 8709, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 5, + "labels": { + "4": { + "count": 2903 + }, + "0": { + "count": 1425 + }, + "2": { + "count": 1466 + }, + "1": { + "count": 1472 + }, + "3": { + "count": 1443 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/SynPerChatbotToneChatbotClassification.json b/mteb/descriptive_stats/Classification/SynPerChatbotToneChatbotClassification.json new file mode 100644 index 0000000000..a7e67564d5 --- /dev/null +++ 
b/mteb/descriptive_stats/Classification/SynPerChatbotToneChatbotClassification.json @@ -0,0 +1,62 @@ +{ + "test": { + "num_samples": 1537, + "number_of_characters": 1459734, + "number_texts_intersect_with_train": 0, + "min_text_length": 383, + "average_text_length": 949.729342875732, + "max_text_length": 1828, + "unique_texts": 1537, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 5, + "labels": { + "3": { + "count": 515 + }, + "2": { + "count": 76 + }, + "4": { + "count": 669 + }, + "0": { + "count": 191 + }, + "1": { + "count": 86 + } + } + }, + "train": { + "num_samples": 8709, + "number_of_characters": 8156459, + "number_texts_intersect_with_train": null, + "min_text_length": 358, + "average_text_length": 936.5551728097371, + "max_text_length": 2013, + "unique_texts": 8709, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 5, + "labels": { + "3": { + "count": 2812 + }, + "4": { + "count": 3838 + }, + "0": { + "count": 1031 + }, + "2": { + "count": 488 + }, + "1": { + "count": 540 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/SynPerChatbotToneUserClassification.json b/mteb/descriptive_stats/Classification/SynPerChatbotToneUserClassification.json new file mode 100644 index 0000000000..0807c5fd92 --- /dev/null +++ b/mteb/descriptive_stats/Classification/SynPerChatbotToneUserClassification.json @@ -0,0 +1,62 @@ +{ + "test": { + "num_samples": 1537, + "number_of_characters": 1459734, + "number_texts_intersect_with_train": 0, + "min_text_length": 383, + "average_text_length": 949.729342875732, + "max_text_length": 1828, + "unique_texts": 1537, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 5, + "labels": { + "0": { + "count": 646 + }, + "3": { + "count": 411 + }, + "4": { + "count": 201 + }, + "1": { + "count": 84 + }, + "2": { + "count": 195 + 
} + } + }, + "train": { + "num_samples": 8709, + "number_of_characters": 8156459, + "number_texts_intersect_with_train": null, + "min_text_length": 358, + "average_text_length": 936.5551728097371, + "max_text_length": 2013, + "unique_texts": 8709, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 5, + "labels": { + "4": { + "count": 1214 + }, + "0": { + "count": 3828 + }, + "2": { + "count": 1091 + }, + "1": { + "count": 526 + }, + "3": { + "count": 2050 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/WikipediaBioMetChemClassification.json b/mteb/descriptive_stats/Classification/WikipediaBioMetChemClassification.json new file mode 100644 index 0000000000..99803e04f1 --- /dev/null +++ b/mteb/descriptive_stats/Classification/WikipediaBioMetChemClassification.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 1149, + "number_of_characters": 1212998, + "number_texts_intersect_with_train": 3, + "min_text_length": 105, + "average_text_length": 1055.698868581375, + "max_text_length": 10484, + "unique_texts": 1144, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 543 + }, + "0": { + "count": 606 + } + } + }, + "train": { + "num_samples": 4592, + "number_of_characters": 5067588, + "number_texts_intersect_with_train": null, + "min_text_length": 105, + "average_text_length": 1103.5688153310105, + "max_text_length": 13719, + "unique_texts": 4578, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 2148 + }, + "0": { + "count": 2444 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/WikipediaBiolumNeurochemClassification.json b/mteb/descriptive_stats/Classification/WikipediaBiolumNeurochemClassification.json new file mode 100644 index 
0000000000..fc4342653b --- /dev/null +++ b/mteb/descriptive_stats/Classification/WikipediaBiolumNeurochemClassification.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 98, + "number_of_characters": 88267, + "number_texts_intersect_with_train": 0, + "min_text_length": 113, + "average_text_length": 900.6836734693877, + "max_text_length": 3514, + "unique_texts": 98, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "0": { + "count": 60 + }, + "1": { + "count": 38 + } + } + }, + "train": { + "num_samples": 388, + "number_of_characters": 392916, + "number_texts_intersect_with_train": null, + "min_text_length": 106, + "average_text_length": 1012.6701030927835, + "max_text_length": 5804, + "unique_texts": 388, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 155 + }, + "0": { + "count": 233 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/WikipediaChemEngSpecialtiesClassification.json b/mteb/descriptive_stats/Classification/WikipediaChemEngSpecialtiesClassification.json new file mode 100644 index 0000000000..46e23581af --- /dev/null +++ b/mteb/descriptive_stats/Classification/WikipediaChemEngSpecialtiesClassification.json @@ -0,0 +1,56 @@ +{ + "test": { + "num_samples": 124, + "number_of_characters": 92601, + "number_texts_intersect_with_train": 0, + "min_text_length": 114, + "average_text_length": 746.7822580645161, + "max_text_length": 5659, + "unique_texts": 124, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 4, + "labels": { + "3": { + "count": 53 + }, + "1": { + "count": 19 + }, + "0": { + "count": 32 + }, + "2": { + "count": 20 + } + } + }, + "train": { + "num_samples": 493, + "number_of_characters": 466734, + "number_texts_intersect_with_train": null, + "min_text_length": 108, + 
"average_text_length": 946.7221095334686, + "max_text_length": 6844, + "unique_texts": 493, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 4, + "labels": { + "1": { + "count": 94 + }, + "2": { + "count": 102 + }, + "3": { + "count": 176 + }, + "0": { + "count": 121 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/WikipediaChemFieldsClassification.json b/mteb/descriptive_stats/Classification/WikipediaChemFieldsClassification.json new file mode 100644 index 0000000000..e2bb504005 --- /dev/null +++ b/mteb/descriptive_stats/Classification/WikipediaChemFieldsClassification.json @@ -0,0 +1,92 @@ +{ + "test": { + "num_samples": 8541, + "number_of_characters": 8413866, + "number_texts_intersect_with_train": 16, + "min_text_length": 105, + "average_text_length": 985.1148577449948, + "max_text_length": 4425, + "unique_texts": 8541, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 10, + "labels": { + "1": { + "count": 2649 + }, + "8": { + "count": 575 + }, + "9": { + "count": 338 + }, + "3": { + "count": 685 + }, + "7": { + "count": 1909 + }, + "2": { + "count": 297 + }, + "4": { + "count": 404 + }, + "0": { + "count": 1198 + }, + "5": { + "count": 318 + }, + "6": { + "count": 168 + } + } + }, + "train": { + "num_samples": 34173, + "number_of_characters": 33663720, + "number_texts_intersect_with_train": null, + "min_text_length": 105, + "average_text_length": 985.0970064085682, + "max_text_length": 4438, + "unique_texts": 34134, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 10, + "labels": { + "0": { + "count": 4717 + }, + "7": { + "count": 7711 + }, + "3": { + "count": 2662 + }, + "1": { + "count": 10567 + }, + "9": { + "count": 1425 + }, + "4": { + "count": 1528 + }, + "8": { + "count": 2423 + }, + "2": { + "count": 1235 + }, + "5": { + "count": 1262 
+ }, + "6": { + "count": 643 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/WikipediaChemistryTopicsClassification.json b/mteb/descriptive_stats/Classification/WikipediaChemistryTopicsClassification.json new file mode 100644 index 0000000000..ef271ca639 --- /dev/null +++ b/mteb/descriptive_stats/Classification/WikipediaChemistryTopicsClassification.json @@ -0,0 +1,86 @@ +{ + "test": { + "num_samples": 421, + "number_of_characters": 494052, + "number_texts_intersect_with_train": 0, + "min_text_length": 105, + "average_text_length": 1173.520190023753, + "max_text_length": 7002, + "unique_texts": 421, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 9, + "labels": { + "5": { + "count": 53 + }, + "2": { + "count": 70 + }, + "0": { + "count": 43 + }, + "8": { + "count": 49 + }, + "3": { + "count": 41 + }, + "6": { + "count": 37 + }, + "1": { + "count": 54 + }, + "7": { + "count": 38 + }, + "4": { + "count": 36 + } + } + }, + "train": { + "num_samples": 1684, + "number_of_characters": 1884233, + "number_texts_intersect_with_train": null, + "min_text_length": 105, + "average_text_length": 1118.9032066508314, + "max_text_length": 19539, + "unique_texts": 1684, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 9, + "labels": { + "0": { + "count": 174 + }, + "1": { + "count": 227 + }, + "2": { + "count": 265 + }, + "6": { + "count": 165 + }, + "5": { + "count": 202 + }, + "3": { + "count": 206 + }, + "4": { + "count": 106 + }, + "7": { + "count": 186 + }, + "8": { + "count": 153 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/WikipediaCompChemSpectroscopyClassification.json b/mteb/descriptive_stats/Classification/WikipediaCompChemSpectroscopyClassification.json new file mode 100644 index 0000000000..dd1cce32ff --- /dev/null +++ 
b/mteb/descriptive_stats/Classification/WikipediaCompChemSpectroscopyClassification.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 221, + "number_of_characters": 214976, + "number_texts_intersect_with_train": 0, + "min_text_length": 105, + "average_text_length": 972.7420814479638, + "max_text_length": 6538, + "unique_texts": 221, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "0": { + "count": 98 + }, + "1": { + "count": 123 + } + } + }, + "train": { + "num_samples": 880, + "number_of_characters": 944871, + "number_texts_intersect_with_train": null, + "min_text_length": 105, + "average_text_length": 1073.7170454545455, + "max_text_length": 10922, + "unique_texts": 880, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 526 + }, + "0": { + "count": 354 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/WikipediaCryobiologySeparationClassification.json b/mteb/descriptive_stats/Classification/WikipediaCryobiologySeparationClassification.json new file mode 100644 index 0000000000..7257532768 --- /dev/null +++ b/mteb/descriptive_stats/Classification/WikipediaCryobiologySeparationClassification.json @@ -0,0 +1,62 @@ +{ + "test": { + "num_samples": 233, + "number_of_characters": 232839, + "number_texts_intersect_with_train": 0, + "min_text_length": 105, + "average_text_length": 999.3090128755365, + "max_text_length": 4118, + "unique_texts": 233, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 5, + "labels": { + "4": { + "count": 43 + }, + "1": { + "count": 61 + }, + "0": { + "count": 45 + }, + "3": { + "count": 46 + }, + "2": { + "count": 38 + } + } + }, + "train": { + "num_samples": 931, + "number_of_characters": 1022703, + "number_texts_intersect_with_train": null, + "min_text_length": 105, 
+ "average_text_length": 1098.4994629430719, + "max_text_length": 19539, + "unique_texts": 931, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 5, + "labels": { + "4": { + "count": 159 + }, + "1": { + "count": 194 + }, + "0": { + "count": 236 + }, + "3": { + "count": 178 + }, + "2": { + "count": 164 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/WikipediaCrystallographyAnalyticalClassification.json b/mteb/descriptive_stats/Classification/WikipediaCrystallographyAnalyticalClassification.json new file mode 100644 index 0000000000..acd1f9a9d4 --- /dev/null +++ b/mteb/descriptive_stats/Classification/WikipediaCrystallographyAnalyticalClassification.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 291, + "number_of_characters": 306551, + "number_texts_intersect_with_train": 1, + "min_text_length": 122, + "average_text_length": 1053.4398625429553, + "max_text_length": 4627, + "unique_texts": 291, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "0": { + "count": 128 + }, + "1": { + "count": 163 + } + } + }, + "train": { + "num_samples": 1160, + "number_of_characters": 1356202, + "number_texts_intersect_with_train": null, + "min_text_length": 107, + "average_text_length": 1169.1396551724138, + "max_text_length": 10968, + "unique_texts": 1159, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 616 + }, + "0": { + "count": 544 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/WikipediaGreenhouseEnantiopureClassification.json b/mteb/descriptive_stats/Classification/WikipediaGreenhouseEnantiopureClassification.json new file mode 100644 index 0000000000..34cd238a25 --- /dev/null +++ 
b/mteb/descriptive_stats/Classification/WikipediaGreenhouseEnantiopureClassification.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 228, + "number_of_characters": 200323, + "number_texts_intersect_with_train": 0, + "min_text_length": 105, + "average_text_length": 878.609649122807, + "max_text_length": 4278, + "unique_texts": 228, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 144 + }, + "0": { + "count": 84 + } + } + }, + "train": { + "num_samples": 908, + "number_of_characters": 755435, + "number_texts_intersect_with_train": null, + "min_text_length": 106, + "average_text_length": 831.9768722466961, + "max_text_length": 11179, + "unique_texts": 908, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "0": { + "count": 343 + }, + "1": { + "count": 565 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/WikipediaIsotopesFissionClassification.json b/mteb/descriptive_stats/Classification/WikipediaIsotopesFissionClassification.json new file mode 100644 index 0000000000..fb632c16d6 --- /dev/null +++ b/mteb/descriptive_stats/Classification/WikipediaIsotopesFissionClassification.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 84, + "number_of_characters": 105980, + "number_texts_intersect_with_train": 0, + "min_text_length": 152, + "average_text_length": 1261.6666666666667, + "max_text_length": 4136, + "unique_texts": 84, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 28 + }, + "0": { + "count": 56 + } + } + }, + "train": { + "num_samples": 333, + "number_of_characters": 418382, + "number_texts_intersect_with_train": null, + "min_text_length": 118, + "average_text_length": 1256.4024024024025, + "max_text_length": 6585, + "unique_texts": 333, + 
"min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 156 + }, + "0": { + "count": 177 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/WikipediaLuminescenceClassification.json b/mteb/descriptive_stats/Classification/WikipediaLuminescenceClassification.json new file mode 100644 index 0000000000..4c1824d0c7 --- /dev/null +++ b/mteb/descriptive_stats/Classification/WikipediaLuminescenceClassification.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 82, + "number_of_characters": 74710, + "number_texts_intersect_with_train": 0, + "min_text_length": 121, + "average_text_length": 911.0975609756098, + "max_text_length": 4132, + "unique_texts": 82, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "0": { + "count": 53 + }, + "1": { + "count": 29 + } + } + }, + "train": { + "num_samples": 328, + "number_of_characters": 318020, + "number_texts_intersect_with_train": null, + "min_text_length": 127, + "average_text_length": 969.5731707317074, + "max_text_length": 5804, + "unique_texts": 328, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "0": { + "count": 164 + }, + "1": { + "count": 164 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/WikipediaOrganicInorganicClassification.json b/mteb/descriptive_stats/Classification/WikipediaOrganicInorganicClassification.json new file mode 100644 index 0000000000..68a65191ee --- /dev/null +++ b/mteb/descriptive_stats/Classification/WikipediaOrganicInorganicClassification.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 263, + "number_of_characters": 214783, + "number_texts_intersect_with_train": 0, + "min_text_length": 106, + "average_text_length": 816.6653992395437, + "max_text_length": 3075, + 
"unique_texts": 263, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "0": { + "count": 148 + }, + "1": { + "count": 115 + } + } + }, + "train": { + "num_samples": 1049, + "number_of_characters": 811275, + "number_texts_intersect_with_train": null, + "min_text_length": 105, + "average_text_length": 773.3794089609152, + "max_text_length": 6504, + "unique_texts": 1049, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "0": { + "count": 595 + }, + "1": { + "count": 454 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/WikipediaSaltsSemiconductorsClassification.json b/mteb/descriptive_stats/Classification/WikipediaSaltsSemiconductorsClassification.json new file mode 100644 index 0000000000..d88c9453f0 --- /dev/null +++ b/mteb/descriptive_stats/Classification/WikipediaSaltsSemiconductorsClassification.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 99, + "number_of_characters": 83781, + "number_texts_intersect_with_train": 0, + "min_text_length": 113, + "average_text_length": 846.2727272727273, + "max_text_length": 5101, + "unique_texts": 99, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 50 + }, + "0": { + "count": 49 + } + } + }, + "train": { + "num_samples": 392, + "number_of_characters": 318087, + "number_texts_intersect_with_train": null, + "min_text_length": 106, + "average_text_length": 811.4464285714286, + "max_text_length": 5279, + "unique_texts": 392, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 184 + }, + "0": { + "count": 208 + } + } + } +} \ No newline at end of file diff --git 
a/mteb/descriptive_stats/Classification/WikipediaSolidStateColloidalClassification.json b/mteb/descriptive_stats/Classification/WikipediaSolidStateColloidalClassification.json new file mode 100644 index 0000000000..38a261d91e --- /dev/null +++ b/mteb/descriptive_stats/Classification/WikipediaSolidStateColloidalClassification.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 444, + "number_of_characters": 420855, + "number_texts_intersect_with_train": 0, + "min_text_length": 114, + "average_text_length": 947.8716216216217, + "max_text_length": 5713, + "unique_texts": 444, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 245 + }, + "0": { + "count": 199 + } + } + }, + "train": { + "num_samples": 1772, + "number_of_characters": 1733543, + "number_texts_intersect_with_train": null, + "min_text_length": 105, + "average_text_length": 978.2974040632054, + "max_text_length": 18971, + "unique_texts": 1772, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "1": { + "count": 987 + }, + "0": { + "count": 785 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Classification/WikipediaTheoreticalAppliedClassification.json b/mteb/descriptive_stats/Classification/WikipediaTheoreticalAppliedClassification.json new file mode 100644 index 0000000000..b916555acf --- /dev/null +++ b/mteb/descriptive_stats/Classification/WikipediaTheoreticalAppliedClassification.json @@ -0,0 +1,44 @@ +{ + "test": { + "num_samples": 11670, + "number_of_characters": 11694561, + "number_texts_intersect_with_train": 20, + "min_text_length": 105, + "average_text_length": 1002.1046272493573, + "max_text_length": 4571, + "unique_texts": 11668, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "0": { + "count": 6171 + }, + "1": { + 
"count": 5499 + } + } + }, + "train": { + "num_samples": 46661, + "number_of_characters": 46374313, + "number_texts_intersect_with_train": null, + "min_text_length": 105, + "average_text_length": 993.8559610809884, + "max_text_length": 4574, + "unique_texts": 46625, + "min_labels_per_text": 1, + "average_label_per_text": 1.0, + "max_labels_per_text": 1, + "unique_labels": 2, + "labels": { + "0": { + "count": 24258 + }, + "1": { + "count": 22403 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Clustering/BeytooteClustering.json b/mteb/descriptive_stats/Clustering/BeytooteClustering.json new file mode 100644 index 0000000000..588b99326b --- /dev/null +++ b/mteb/descriptive_stats/Clustering/BeytooteClustering.json @@ -0,0 +1,73 @@ +{ + "test": { + "num_samples": 2048, + "number_of_characters": 422686, + "min_text_length": 25, + "average_text_length": 206.3896484375, + "max_text_length": 565, + "unique_texts": 374, + "min_labels_per_text": 26, + "average_labels_per_text": 1.0, + "max_labels_per_text": 271, + "unique_labels": 19, + "labels": { + "4": { + "count": 184 + }, + "12": { + "count": 256 + }, + "6": { + "count": 112 + }, + "5": { + "count": 104 + }, + "0": { + "count": 101 + }, + "13": { + "count": 27 + }, + "15": { + "count": 79 + }, + "2": { + "count": 88 + }, + "10": { + "count": 26 + }, + "7": { + "count": 114 + }, + "1": { + "count": 131 + }, + "16": { + "count": 76 + }, + "9": { + "count": 95 + }, + "17": { + "count": 83 + }, + "11": { + "count": 65 + }, + "3": { + "count": 271 + }, + "14": { + "count": 91 + }, + "18": { + "count": 82 + }, + "8": { + "count": 63 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Clustering/DigikalamagClustering.json b/mteb/descriptive_stats/Clustering/DigikalamagClustering.json new file mode 100644 index 0000000000..e88e7c9d45 --- /dev/null +++ b/mteb/descriptive_stats/Clustering/DigikalamagClustering.json @@ -0,0 +1,37 @@ +{ + "test": { + "num_samples": 852, + 
"number_of_characters": 2935825, + "min_text_length": 40, + "average_text_length": 3445.8039906103286, + "max_text_length": 36911, + "unique_texts": 792, + "min_labels_per_text": 12, + "average_labels_per_text": 1.0, + "max_labels_per_text": 277, + "unique_labels": 7, + "labels": { + "2": { + "count": 161 + }, + "3": { + "count": 277 + }, + "5": { + "count": 167 + }, + "0": { + "count": 197 + }, + "6": { + "count": 25 + }, + "1": { + "count": 13 + }, + "4": { + "count": 12 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Clustering/HamshahriClustring.json b/mteb/descriptive_stats/Clustering/HamshahriClustring.json new file mode 100644 index 0000000000..663d5d9cc3 --- /dev/null +++ b/mteb/descriptive_stats/Clustering/HamshahriClustring.json @@ -0,0 +1,157 @@ +{ + "test": { + "num_samples": 2048, + "number_of_characters": 444075, + "min_text_length": 72, + "average_text_length": 216.83349609375, + "max_text_length": 458, + "unique_texts": 322, + "min_labels_per_text": 2, + "average_labels_per_text": 1.0, + "max_labels_per_text": 237, + "unique_labels": 47, + "labels": { + "6": { + "count": 96 + }, + "11": { + "count": 150 + }, + "10": { + "count": 189 + }, + "25": { + "count": 132 + }, + "14": { + "count": 26 + }, + "27": { + "count": 101 + }, + "34": { + "count": 25 + }, + "29": { + "count": 111 + }, + "28": { + "count": 141 + }, + "17": { + "count": 51 + }, + "33": { + "count": 54 + }, + "24": { + "count": 12 + }, + "12": { + "count": 132 + }, + "42": { + "count": 237 + }, + "0": { + "count": 33 + }, + "30": { + "count": 64 + }, + "35": { + "count": 23 + }, + "3": { + "count": 49 + }, + "44": { + "count": 9 + }, + "4": { + "count": 16 + }, + "23": { + "count": 7 + }, + "16": { + "count": 37 + }, + "8": { + "count": 26 + }, + "38": { + "count": 36 + }, + "1": { + "count": 21 + }, + "46": { + "count": 14 + }, + "2": { + "count": 15 + }, + "45": { + "count": 16 + }, + "7": { + "count": 27 + }, + "9": { + "count": 12 + }, + "5": { + 
"count": 20 + }, + "31": { + "count": 21 + }, + "13": { + "count": 9 + }, + "43": { + "count": 16 + }, + "36": { + "count": 7 + }, + "32": { + "count": 41 + }, + "26": { + "count": 15 + }, + "21": { + "count": 10 + }, + "22": { + "count": 12 + }, + "20": { + "count": 15 + }, + "19": { + "count": 2 + }, + "18": { + "count": 2 + }, + "39": { + "count": 2 + }, + "40": { + "count": 2 + }, + "15": { + "count": 5 + }, + "37": { + "count": 5 + }, + "41": { + "count": 2 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Clustering/NLPTwitterAnalysisClustering.json b/mteb/descriptive_stats/Clustering/NLPTwitterAnalysisClustering.json new file mode 100644 index 0000000000..d6dd66b772 --- /dev/null +++ b/mteb/descriptive_stats/Clustering/NLPTwitterAnalysisClustering.json @@ -0,0 +1,94 @@ +{ + "test": { + "num_samples": 1360, + "number_of_characters": 171909, + "min_text_length": 2, + "average_text_length": 126.40367647058824, + "max_text_length": 4523, + "unique_texts": 269, + "min_labels_per_text": 50, + "average_labels_per_text": 1.0, + "max_labels_per_text": 101, + "unique_labels": 26, + "labels": { + "25": { + "count": 50 + }, + "13": { + "count": 50 + }, + "1": { + "count": 50 + }, + "9": { + "count": 50 + }, + "16": { + "count": 101 + }, + "4": { + "count": 50 + }, + "8": { + "count": 50 + }, + "7": { + "count": 50 + }, + "24": { + "count": 50 + }, + "11": { + "count": 50 + }, + "20": { + "count": 50 + }, + "0": { + "count": 50 + }, + "21": { + "count": 55 + }, + "10": { + "count": 50 + }, + "14": { + "count": 50 + }, + "18": { + "count": 50 + }, + "17": { + "count": 50 + }, + "22": { + "count": 50 + }, + "23": { + "count": 50 + }, + "12": { + "count": 50 + }, + "2": { + "count": 50 + }, + "19": { + "count": 50 + }, + "15": { + "count": 50 + }, + "5": { + "count": 54 + }, + "3": { + "count": 50 + }, + "6": { + "count": 50 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Clustering/SIDClustring.json 
b/mteb/descriptive_stats/Clustering/SIDClustring.json new file mode 100644 index 0000000000..dcb17cb639 --- /dev/null +++ b/mteb/descriptive_stats/Clustering/SIDClustring.json @@ -0,0 +1,40 @@ +{ + "test": { + "num_samples": 2048, + "number_of_characters": 2645730, + "min_text_length": 187, + "average_text_length": 1291.8603515625, + "max_text_length": 5144, + "unique_texts": 1090, + "min_labels_per_text": 31, + "average_labels_per_text": 1.0, + "max_labels_per_text": 790, + "unique_labels": 8, + "labels": { + "1": { + "count": 354 + }, + "6": { + "count": 31 + }, + "3": { + "count": 211 + }, + "7": { + "count": 105 + }, + "4": { + "count": 790 + }, + "0": { + "count": 87 + }, + "2": { + "count": 206 + }, + "5": { + "count": 264 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Clustering/WikipediaChemistryTopicsClustering.json b/mteb/descriptive_stats/Clustering/WikipediaChemistryTopicsClustering.json new file mode 100644 index 0000000000..5c6ce7799e --- /dev/null +++ b/mteb/descriptive_stats/Clustering/WikipediaChemistryTopicsClustering.json @@ -0,0 +1,43 @@ +{ + "test": { + "num_samples": 1, + "number_of_characters": 2105, + "min_text_length": 2105, + "average_text_length": 2105.0, + "max_text_length": 2105, + "unique_texts": 2105, + "min_labels_per_text": 142, + "average_labels_per_text": 2105.0, + "max_labels_per_text": 335, + "unique_labels": 9, + "labels": { + "Luminescence": { + "count": 217 + }, + "Cryobiology": { + "count": 281 + }, + "Tissue Engineering": { + "count": 335 + }, + "Nuclear Fusion": { + "count": 247 + }, + "Acids + Bases": { + "count": 142 + }, + "Separation Processes": { + "count": 255 + }, + "Carbohydrates": { + "count": 202 + }, + "Magnetic Ordering": { + "count": 224 + }, + "Ultraviolet Radiation": { + "count": 202 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Clustering/WikipediaSpecialtiesInChemistryClustering.json 
b/mteb/descriptive_stats/Clustering/WikipediaSpecialtiesInChemistryClustering.json new file mode 100644 index 0000000000..9c68264eb2 --- /dev/null +++ b/mteb/descriptive_stats/Clustering/WikipediaSpecialtiesInChemistryClustering.json @@ -0,0 +1,28 @@ +{ + "test": { + "num_samples": 1, + "number_of_characters": 617, + "min_text_length": 617, + "average_text_length": 617.0, + "max_text_length": 617, + "unique_texts": 617, + "min_labels_per_text": 113, + "average_labels_per_text": 617.0, + "max_labels_per_text": 229, + "unique_labels": 4, + "labels": { + "Biochemists": { + "count": 153 + }, + "Geochemists": { + "count": 113 + }, + "Electrochemists": { + "count": 122 + }, + "Atmospheric Chemists": { + "count": 229 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/PairClassification/CExaPPC.json b/mteb/descriptive_stats/PairClassification/CExaPPC.json new file mode 100644 index 0000000000..e6e5c4e595 --- /dev/null +++ b/mteb/descriptive_stats/PairClassification/CExaPPC.json @@ -0,0 +1,24 @@ +{ + "test": { + "num_samples": 13504, + "number_of_characters": 3053248, + "unique_pairs": 13504, + "min_sentence1_length": 3, + "avg_sentence1_length": 121.67913210900474, + "max_sentence1_length": 21748, + "unique_sentence1": 9172, + "min_sentence2_length": 4, + "avg_sentence2_length": 104.42039395734596, + "max_sentence2_length": 470, + "unique_sentence2": 8969, + "unique_labels": 2, + "labels": { + "0": { + "count": 8182 + }, + "1": { + "count": 5322 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/PairClassification/FarsiParaphraseDetection.json b/mteb/descriptive_stats/PairClassification/FarsiParaphraseDetection.json new file mode 100644 index 0000000000..54361e807f --- /dev/null +++ b/mteb/descriptive_stats/PairClassification/FarsiParaphraseDetection.json @@ -0,0 +1,24 @@ +{ + "test": { + "num_samples": 783, + "number_of_characters": 46743, + "unique_pairs": 783, + "min_sentence1_length": 7, + "avg_sentence1_length": 
29.513409961685824, + "max_sentence1_length": 140, + "unique_sentence1": 708, + "min_sentence2_length": 8, + "avg_sentence2_length": 30.183908045977013, + "max_sentence2_length": 147, + "unique_sentence2": 716, + "unique_labels": 2, + "labels": { + "0": { + "count": 448 + }, + "1": { + "count": 335 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/PairClassification/ParsinluEntail.json b/mteb/descriptive_stats/PairClassification/ParsinluEntail.json new file mode 100644 index 0000000000..2a6afb2f7f --- /dev/null +++ b/mteb/descriptive_stats/PairClassification/ParsinluEntail.json @@ -0,0 +1,24 @@ +{ + "test": { + "num_samples": 1173, + "number_of_characters": 211756, + "unique_pairs": 1158, + "min_sentence1_length": 2, + "avg_sentence1_length": 122.6231884057971, + "max_sentence1_length": 667, + "unique_sentence1": 1118, + "min_sentence2_length": 2, + "avg_sentence2_length": 57.90196078431372, + "max_sentence2_length": 266, + "unique_sentence2": 1156, + "unique_labels": 2, + "labels": { + "0": { + "count": 563 + }, + "1": { + "count": 610 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/PairClassification/ParsinluQueryParaphPC.json b/mteb/descriptive_stats/PairClassification/ParsinluQueryParaphPC.json new file mode 100644 index 0000000000..426a08f7b5 --- /dev/null +++ b/mteb/descriptive_stats/PairClassification/ParsinluQueryParaphPC.json @@ -0,0 +1,24 @@ +{ + "test": { + "num_samples": 1916, + "number_of_characters": 162524, + "unique_pairs": 1914, + "min_sentence1_length": 9, + "avg_sentence1_length": 42.135699373695196, + "max_sentence1_length": 186, + "unique_sentence1": 813, + "min_sentence2_length": 9, + "avg_sentence2_length": 42.68893528183716, + "max_sentence2_length": 293, + "unique_sentence2": 1900, + "unique_labels": 2, + "labels": { + "0": { + "count": 1082 + }, + "1": { + "count": 834 + } + } + } +} \ No newline at end of file diff --git 
a/mteb/descriptive_stats/PairClassification/PubChemAISentenceParaphrasePC.json b/mteb/descriptive_stats/PairClassification/PubChemAISentenceParaphrasePC.json new file mode 100644 index 0000000000..6933a25736 --- /dev/null +++ b/mteb/descriptive_stats/PairClassification/PubChemAISentenceParaphrasePC.json @@ -0,0 +1,24 @@ +{ + "test": { + "num_samples": 2048, + "number_of_characters": 348246, + "unique_pairs": 2042, + "min_sentence1_length": 9, + "avg_sentence1_length": 84.4833984375, + "max_sentence1_length": 420, + "unique_sentence1": 1885, + "min_sentence2_length": 13, + "avg_sentence2_length": 85.55859375, + "max_sentence2_length": 433, + "unique_sentence2": 1895, + "unique_labels": 2, + "labels": { + "0": { + "count": 1843 + }, + "1": { + "count": 205 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/PairClassification/PubChemSMILESPC.json b/mteb/descriptive_stats/PairClassification/PubChemSMILESPC.json new file mode 100644 index 0000000000..f479495254 --- /dev/null +++ b/mteb/descriptive_stats/PairClassification/PubChemSMILESPC.json @@ -0,0 +1,24 @@ +{ + "test": { + "num_samples": 2048, + "number_of_characters": 293689, + "unique_pairs": 2048, + "min_sentence1_length": 5, + "avg_sentence1_length": 94.71630859375, + "max_sentence1_length": 743, + "unique_sentence1": 2048, + "min_sentence2_length": 19, + "avg_sentence2_length": 48.6865234375, + "max_sentence2_length": 100, + "unique_sentence2": 1957, + "unique_labels": 2, + "labels": { + "0": { + "count": 1844 + }, + "1": { + "count": 204 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/PairClassification/PubChemSynonymPC.json b/mteb/descriptive_stats/PairClassification/PubChemSynonymPC.json new file mode 100644 index 0000000000..3ae3a7ab2c --- /dev/null +++ b/mteb/descriptive_stats/PairClassification/PubChemSynonymPC.json @@ -0,0 +1,24 @@ +{ + "test": { + "num_samples": 2048, + "number_of_characters": 110931, + "unique_pairs": 2048, + 
"min_sentence1_length": 4, + "avg_sentence1_length": 18.8017578125, + "max_sentence1_length": 182, + "unique_sentence1": 2048, + "min_sentence2_length": 2, + "avg_sentence2_length": 35.36376953125, + "max_sentence2_length": 329, + "unique_sentence2": 2048, + "unique_labels": 2, + "labels": { + "1": { + "count": 204 + }, + "0": { + "count": 1844 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/PairClassification/PubChemWikiPairClassification.json b/mteb/descriptive_stats/PairClassification/PubChemWikiPairClassification.json new file mode 100644 index 0000000000..116805a1e9 --- /dev/null +++ b/mteb/descriptive_stats/PairClassification/PubChemWikiPairClassification.json @@ -0,0 +1,290 @@ +{ + "test": { + "num_samples": 11023, + "number_of_characters": 8850978, + "unique_pairs": 11017, + "min_sentence1_length": 21, + "avg_sentence1_length": 394.76367595028574, + "max_sentence1_length": 5088, + "unique_sentence1": 5078, + "min_sentence2_length": 13, + "avg_sentence2_length": 408.1917808219178, + "max_sentence2_length": 6946, + "unique_sentence2": 9518, + "unique_labels": 2, + "labels": { + "0": { + "count": 9914 + }, + "1": { + "count": 1109 + } + }, + "hf_subset_descriptive_stats": { + "de": { + "num_samples": 1554, + "number_of_characters": 1135687, + "unique_pairs": 1554, + "min_sentence1_length": 26, + "avg_sentence1_length": 374.6756756756757, + "max_sentence1_length": 5088, + "unique_sentence1": 1483, + "min_sentence2_length": 34, + "avg_sentence2_length": 356.13963963963965, + "max_sentence2_length": 2441, + "unique_sentence2": 1357, + "unique_labels": 2, + "labels": { + "0": { + "count": 1398 + }, + "1": { + "count": 156 + } + } + }, + "nl": { + "num_samples": 905, + "number_of_characters": 849823, + "unique_pairs": 904, + "min_sentence1_length": 23, + "avg_sentence1_length": 411.2364640883978, + "max_sentence1_length": 5088, + "unique_sentence1": 869, + "min_sentence2_length": 33, + "avg_sentence2_length": 527.7944751381216, + 
"max_sentence2_length": 2291, + "unique_sentence2": 787, + "unique_labels": 2, + "labels": { + "1": { + "count": 91 + }, + "0": { + "count": 814 + } + } + }, + "zh": { + "num_samples": 1442, + "number_of_characters": 734124, + "unique_pairs": 1442, + "min_sentence1_length": 23, + "avg_sentence1_length": 373.0929264909847, + "max_sentence1_length": 5088, + "unique_sentence1": 1358, + "min_sentence2_length": 13, + "avg_sentence2_length": 136.00832177531206, + "max_sentence2_length": 889, + "unique_sentence2": 1274, + "unique_labels": 2, + "labels": { + "0": { + "count": 1297 + }, + "1": { + "count": 145 + } + } + }, + "fr": { + "num_samples": 1356, + "number_of_characters": 1243355, + "unique_pairs": 1353, + "min_sentence1_length": 23, + "avg_sentence1_length": 387.4092920353982, + "max_sentence1_length": 3478, + "unique_sentence1": 1293, + "min_sentence2_length": 30, + "avg_sentence2_length": 529.5191740412979, + "max_sentence2_length": 3516, + "unique_sentence2": 1159, + "unique_labels": 2, + "labels": { + "1": { + "count": 136 + }, + "0": { + "count": 1220 + } + } + }, + "es": { + "num_samples": 1231, + "number_of_characters": 1252739, + "unique_pairs": 1229, + "min_sentence1_length": 21, + "avg_sentence1_length": 391.6490658001625, + "max_sentence1_length": 3437, + "unique_sentence1": 1175, + "min_sentence2_length": 50, + "avg_sentence2_length": 626.0105605199025, + "max_sentence2_length": 6946, + "unique_sentence2": 1052, + "unique_labels": 2, + "labels": { + "0": { + "count": 1107 + }, + "1": { + "count": 124 + } + } + }, + "pt": { + "num_samples": 998, + "number_of_characters": 941109, + "unique_pairs": 998, + "min_sentence1_length": 25, + "avg_sentence1_length": 409.17835671342687, + "max_sentence1_length": 2524, + "unique_sentence1": 950, + "min_sentence2_length": 25, + "avg_sentence2_length": 533.816633266533, + "max_sentence2_length": 3630, + "unique_sentence2": 864, + "unique_labels": 2, + "labels": { + "0": { + "count": 898 + }, + "1": { + "count": 100 + 
} + } + }, + "ms": { + "num_samples": 192, + "number_of_characters": 227803, + "unique_pairs": 192, + "min_sentence1_length": 30, + "avg_sentence1_length": 477.640625, + "max_sentence1_length": 3161, + "unique_sentence1": 186, + "min_sentence2_length": 135, + "avg_sentence2_length": 708.8333333333334, + "max_sentence2_length": 3820, + "unique_sentence2": 160, + "unique_labels": 2, + "labels": { + "0": { + "count": 172 + }, + "1": { + "count": 20 + } + } + }, + "ko": { + "num_samples": 554, + "number_of_characters": 415390, + "unique_pairs": 554, + "min_sentence1_length": 24, + "avg_sentence1_length": 444.1750902527076, + "max_sentence1_length": 3547, + "unique_sentence1": 526, + "min_sentence2_length": 27, + "avg_sentence2_length": 305.62635379061373, + "max_sentence2_length": 2550, + "unique_sentence2": 470, + "unique_labels": 2, + "labels": { + "0": { + "count": 498 + }, + "1": { + "count": 56 + } + } + }, + "tr": { + "num_samples": 518, + "number_of_characters": 582769, + "unique_pairs": 518, + "min_sentence1_length": 23, + "avg_sentence1_length": 491.6988416988417, + "max_sentence1_length": 4658, + "unique_sentence1": 493, + "min_sentence2_length": 54, + "avg_sentence2_length": 633.3378378378378, + "max_sentence2_length": 3949, + "unique_sentence2": 452, + "unique_labels": 2, + "labels": { + "0": { + "count": 466 + }, + "1": { + "count": 52 + } + } + }, + "hi": { + "num_samples": 228, + "number_of_characters": 168480, + "unique_pairs": 228, + "min_sentence1_length": 27, + "avg_sentence1_length": 405.859649122807, + "max_sentence1_length": 4236, + "unique_sentence1": 217, + "min_sentence2_length": 17, + "avg_sentence2_length": 333.0877192982456, + "max_sentence2_length": 4760, + "unique_sentence2": 188, + "unique_labels": 2, + "labels": { + "0": { + "count": 205 + }, + "1": { + "count": 23 + } + } + }, + "cs": { + "num_samples": 611, + "number_of_characters": 520475, + "unique_pairs": 611, + "min_sentence1_length": 24, + "avg_sentence1_length": 
387.6317512274959, + "max_sentence1_length": 3760, + "unique_sentence1": 579, + "min_sentence2_length": 55, + "avg_sentence2_length": 464.20949263502456, + "max_sentence2_length": 2351, + "unique_sentence2": 521, + "unique_labels": 2, + "labels": { + "0": { + "count": 549 + }, + "1": { + "count": 62 + } + } + }, + "ja": { + "num_samples": 1434, + "number_of_characters": 779224, + "unique_pairs": 1434, + "min_sentence1_length": 21, + "avg_sentence1_length": 363.5976290097629, + "max_sentence1_length": 2160, + "unique_sentence1": 1369, + "min_sentence2_length": 29, + "avg_sentence2_length": 179.79428172942818, + "max_sentence2_length": 1072, + "unique_sentence2": 1234, + "unique_labels": 2, + "labels": { + "0": { + "count": 1290 + }, + "1": { + "count": 144 + } + } + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/PairClassification/PubChemWikiParagraphsPC.json b/mteb/descriptive_stats/PairClassification/PubChemWikiParagraphsPC.json new file mode 100644 index 0000000000..91e90373c9 --- /dev/null +++ b/mteb/descriptive_stats/PairClassification/PubChemWikiParagraphsPC.json @@ -0,0 +1,24 @@ +{ + "test": { + "num_samples": 2048, + "number_of_characters": 1374316, + "unique_pairs": 2048, + "min_sentence1_length": 15, + "avg_sentence1_length": 337.8017578125, + "max_sentence1_length": 4990, + "unique_sentence1": 1677, + "min_sentence2_length": 14, + "avg_sentence2_length": 333.2509765625, + "max_sentence2_length": 5046, + "unique_sentence2": 1698, + "unique_labels": 2, + "labels": { + "0": { + "count": 1843 + }, + "1": { + "count": 205 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/PairClassification/SynPerChatbotRAGFAQPC.json b/mteb/descriptive_stats/PairClassification/SynPerChatbotRAGFAQPC.json new file mode 100644 index 0000000000..125025f899 --- /dev/null +++ b/mteb/descriptive_stats/PairClassification/SynPerChatbotRAGFAQPC.json @@ -0,0 +1,24 @@ +{ + "test": { + "num_samples": 2174, + "number_of_characters": 
1682951, + "unique_pairs": 2174, + "min_sentence1_length": 37, + "avg_sentence1_length": 628.5234590616375, + "max_sentence1_length": 2601, + "unique_sentence1": 1087, + "min_sentence2_length": 61, + "avg_sentence2_length": 145.60303587856487, + "max_sentence2_length": 297, + "unique_sentence2": 2174, + "unique_labels": 2, + "labels": { + "1": { + "count": 1087 + }, + "0": { + "count": 1087 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/PairClassification/SynPerQAPC.json b/mteb/descriptive_stats/PairClassification/SynPerQAPC.json new file mode 100644 index 0000000000..68183e6049 --- /dev/null +++ b/mteb/descriptive_stats/PairClassification/SynPerQAPC.json @@ -0,0 +1,24 @@ +{ + "test": { + "num_samples": 55568, + "number_of_characters": 20690473, + "unique_pairs": 55402, + "min_sentence1_length": 10, + "avg_sentence1_length": 59.132126403685575, + "max_sentence1_length": 172, + "unique_sentence1": 27243, + "min_sentence2_length": 6, + "avg_sentence2_length": 313.2129822919666, + "max_sentence2_length": 866, + "unique_sentence2": 50194, + "unique_labels": 2, + "labels": { + "1": { + "count": 27784 + }, + "0": { + "count": 27784 + } + } + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/PairClassification/SynPerTextKeywordsPC.json b/mteb/descriptive_stats/PairClassification/SynPerTextKeywordsPC.json new file mode 100644 index 0000000000..a0ca04235d --- /dev/null +++ b/mteb/descriptive_stats/PairClassification/SynPerTextKeywordsPC.json @@ -0,0 +1,24 @@ +{ + "test": { + "num_samples": 5856, + "number_of_characters": 1554203, + "unique_pairs": 5852, + "min_sentence1_length": 60, + "avg_sentence1_length": 221.99521857923497, + "max_sentence1_length": 399, + "unique_sentence1": 2923, + "min_sentence2_length": 9, + "avg_sentence2_length": 43.408299180327866, + "max_sentence2_length": 139, + "unique_sentence2": 5847, + "unique_labels": 2, + "labels": { + "1": { + "count": 2928 + }, + "0": { + "count": 2928 + } + } + } +} \ 
No newline at end of file diff --git a/mteb/descriptive_stats/Reranking/CodeRAGLibraryDocumentationSolutions.json b/mteb/descriptive_stats/Reranking/CodeRAGLibraryDocumentationSolutions.json new file mode 100644 index 0000000000..beb455ed47 --- /dev/null +++ b/mteb/descriptive_stats/Reranking/CodeRAGLibraryDocumentationSolutions.json @@ -0,0 +1,31 @@ +{ + "train": { + "num_samples": 61198, + "number_of_characters": 57181324, + "num_documents": 30599, + "min_document_length": 5, + "average_document_length": 1786.6975064544592, + "max_document_length": 385432, + "unique_documents": 30599, + "num_queries": 30599, + "min_query_length": 2, + "average_query_length": 82.03428216608386, + "max_query_length": 43706, + "unique_queries": 30599, + "none_queries": 0, + "num_relevant_docs": 30599, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 30599, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Reranking/CodeRAGOnlineTutorials.json b/mteb/descriptive_stats/Reranking/CodeRAGOnlineTutorials.json new file mode 100644 index 0000000000..f99e2b1858 --- /dev/null +++ b/mteb/descriptive_stats/Reranking/CodeRAGOnlineTutorials.json @@ -0,0 +1,31 @@ +{ + "train": { + "num_samples": 153286, + "number_of_characters": 432903239, + "num_documents": 76643, + "min_document_length": 17, + "average_document_length": 5594.971308534374, + "max_document_length": 477365, + "unique_documents": 76643, + "num_queries": 76643, + "min_query_length": 3, + "average_query_length": 53.33628641885104, + "max_query_length": 221, + "unique_queries": 76643, + "none_queries": 0, + "num_relevant_docs": 
76643, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 76643, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Reranking/CodeRAGProgrammingSolutions.json b/mteb/descriptive_stats/Reranking/CodeRAGProgrammingSolutions.json new file mode 100644 index 0000000000..5f5491ca9f --- /dev/null +++ b/mteb/descriptive_stats/Reranking/CodeRAGProgrammingSolutions.json @@ -0,0 +1,31 @@ +{ + "train": { + "num_samples": 2114, + "number_of_characters": 355909, + "num_documents": 1128, + "min_document_length": 30, + "average_document_length": 246.27304964539007, + "max_document_length": 1993, + "unique_documents": 1128, + "num_queries": 986, + "min_query_length": 11, + "average_query_length": 79.22210953346855, + "max_query_length": 251, + "unique_queries": 1128, + "none_queries": 142, + "num_relevant_docs": 1128, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1128, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/ArguAna-Fa.json b/mteb/descriptive_stats/Retrieval/ArguAna-Fa.json new file mode 100644 index 0000000000..0d9c8ef6ff --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/ArguAna-Fa.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 10080, + 
"number_of_characters": 9458841, + "num_documents": 8674, + "min_document_length": 1, + "average_document_length": 918.7068249942356, + "max_document_length": 4427, + "unique_documents": 8674, + "num_queries": 1406, + "min_query_length": 189, + "average_query_length": 1059.7283072546231, + "max_query_length": 4234, + "unique_queries": 1406, + "none_queries": 0, + "num_relevant_docs": 1406, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 1406, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/CQADupstackAndroidRetrieval-Fa.json b/mteb/descriptive_stats/Retrieval/CQADupstackAndroidRetrieval-Fa.json new file mode 100644 index 0000000000..3397740bca --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/CQADupstackAndroidRetrieval-Fa.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 23697, + "number_of_characters": 13269812, + "num_documents": 22998, + "min_document_length": 11, + "average_document_length": 575.3561614053395, + "max_document_length": 5081, + "unique_documents": 22998, + "num_queries": 699, + "min_query_length": 14, + "average_query_length": 54.035765379113016, + "max_query_length": 155, + "unique_queries": 699, + "none_queries": 0, + "num_relevant_docs": 1696, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 2.4263233190271816, + "max_relevant_docs_per_query": 262, + "unique_relevant_docs": 1696, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + 
"min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/CQADupstackEnglishRetrieval-Fa.json b/mteb/descriptive_stats/Retrieval/CQADupstackEnglishRetrieval-Fa.json new file mode 100644 index 0000000000..8f4ecefe4e --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/CQADupstackEnglishRetrieval-Fa.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 41782, + "number_of_characters": 17581227, + "num_documents": 40221, + "min_document_length": 13, + "average_document_length": 435.477760373934, + "max_document_length": 4694, + "unique_documents": 40221, + "num_queries": 1561, + "min_query_length": 7, + "average_query_length": 42.2011531069827, + "max_query_length": 153, + "unique_queries": 1570, + "none_queries": 9, + "num_relevant_docs": 3765, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 2.3980891719745223, + "max_relevant_docs_per_query": 79, + "unique_relevant_docs": 3765, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/CQADupstackGamingRetrieval-Fa.json b/mteb/descriptive_stats/Retrieval/CQADupstackGamingRetrieval-Fa.json new file mode 100644 index 0000000000..990929a5e6 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/CQADupstackGamingRetrieval-Fa.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 46896, + "number_of_characters": 21021720, + "num_documents": 45301, + "min_document_length": 0, + "average_document_length": 462.30999315688393, + "max_document_length": 5065, + "unique_documents": 45301, + "num_queries": 1595, + "min_query_length": 7, + 
"average_query_length": 49.288401253918494, + "max_query_length": 132, + "unique_queries": 1595, + "none_queries": 0, + "num_relevant_docs": 2263, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.418808777429467, + "max_relevant_docs_per_query": 30, + "unique_relevant_docs": 2263, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/CQADupstackGisRetrieval-Fa.json b/mteb/descriptive_stats/Retrieval/CQADupstackGisRetrieval-Fa.json new file mode 100644 index 0000000000..7369b68c64 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/CQADupstackGisRetrieval-Fa.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 38522, + "number_of_characters": 32961978, + "num_documents": 37637, + "min_document_length": 17, + "average_document_length": 874.5469351967479, + "max_document_length": 5738, + "unique_documents": 37637, + "num_queries": 885, + "min_query_length": 12, + "average_query_length": 52.717514124293785, + "max_query_length": 142, + "unique_queries": 885, + "none_queries": 0, + "num_relevant_docs": 1114, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.2587570621468926, + "max_relevant_docs_per_query": 22, + "unique_relevant_docs": 1114, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/CQADupstackMathematicaRetrieval-Fa.json 
b/mteb/descriptive_stats/Retrieval/CQADupstackMathematicaRetrieval-Fa.json new file mode 100644 index 0000000000..d3aae85631 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/CQADupstackMathematicaRetrieval-Fa.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 17503, + "number_of_characters": 16590623, + "num_documents": 16705, + "min_document_length": 11, + "average_document_length": 990.9334331038611, + "max_document_length": 5280, + "unique_documents": 16705, + "num_queries": 798, + "min_query_length": 7, + "average_query_length": 46.46616541353384, + "max_query_length": 142, + "unique_queries": 804, + "none_queries": 6, + "num_relevant_docs": 1358, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.6890547263681592, + "max_relevant_docs_per_query": 56, + "unique_relevant_docs": 1358, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/CQADupstackPhysicsRetrieval-Fa.json b/mteb/descriptive_stats/Retrieval/CQADupstackPhysicsRetrieval-Fa.json new file mode 100644 index 0000000000..39bdf14c6f --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/CQADupstackPhysicsRetrieval-Fa.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 39347, + "number_of_characters": 27772886, + "num_documents": 38316, + "min_document_length": 0, + "average_document_length": 723.60877962209, + "max_document_length": 5053, + "unique_documents": 38316, + "num_queries": 1031, + "min_query_length": 9, + "average_query_length": 45.67604267701261, + "max_query_length": 158, + "unique_queries": 1039, + "none_queries": 8, + "num_relevant_docs": 1933, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.8604427333974976, 
+ "max_relevant_docs_per_query": 72, + "unique_relevant_docs": 1933, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/CQADupstackProgrammersRetrieval-Fa.json b/mteb/descriptive_stats/Retrieval/CQADupstackProgrammersRetrieval-Fa.json new file mode 100644 index 0000000000..850ef8d7d7 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/CQADupstackProgrammersRetrieval-Fa.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 33052, + "number_of_characters": 31246627, + "num_documents": 32176, + "min_document_length": 13, + "average_document_length": 969.6830246146196, + "max_document_length": 5175, + "unique_documents": 32176, + "num_queries": 876, + "min_query_length": 11, + "average_query_length": 52.6324200913242, + "max_query_length": 132, + "unique_queries": 876, + "none_queries": 0, + "num_relevant_docs": 1675, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.9121004566210045, + "max_relevant_docs_per_query": 149, + "unique_relevant_docs": 1675, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/CQADupstackStatsRetrieval-Fa.json b/mteb/descriptive_stats/Retrieval/CQADupstackStatsRetrieval-Fa.json new file mode 100644 index 0000000000..e262dbc1a0 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/CQADupstackStatsRetrieval-Fa.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 
42921, + "number_of_characters": 37808958, + "num_documents": 42269, + "min_document_length": 0, + "average_document_length": 893.7330904445338, + "max_document_length": 7532, + "unique_documents": 42269, + "num_queries": 652, + "min_query_length": 11, + "average_query_length": 48.70245398773006, + "max_query_length": 133, + "unique_queries": 652, + "none_queries": 0, + "num_relevant_docs": 913, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.4003067484662577, + "max_relevant_docs_per_query": 18, + "unique_relevant_docs": 913, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/CQADupstackTexRetrieval-Fa.json b/mteb/descriptive_stats/Retrieval/CQADupstackTexRetrieval-Fa.json new file mode 100644 index 0000000000..af704dbdf2 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/CQADupstackTexRetrieval-Fa.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 71082, + "number_of_characters": 70999512, + "num_documents": 68184, + "min_document_length": 0, + "average_document_length": 1039.4095828933473, + "max_document_length": 6283, + "unique_documents": 68184, + "num_queries": 2898, + "min_query_length": 8, + "average_query_length": 44.30952380952381, + "max_query_length": 129, + "unique_queries": 2906, + "none_queries": 8, + "num_relevant_docs": 5154, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.7735719201651754, + "max_relevant_docs_per_query": 146, + "unique_relevant_docs": 5154, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + 
"min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/CQADupstackUnixRetrieval-Fa.json b/mteb/descriptive_stats/Retrieval/CQADupstackUnixRetrieval-Fa.json new file mode 100644 index 0000000000..4b53c787a0 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/CQADupstackUnixRetrieval-Fa.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 48451, + "number_of_characters": 39587389, + "num_documents": 47382, + "min_document_length": 13, + "average_document_length": 834.364125617323, + "max_document_length": 5124, + "unique_documents": 47382, + "num_queries": 1069, + "min_query_length": 10, + "average_query_length": 50.09167446211413, + "max_query_length": 141, + "unique_queries": 1072, + "none_queries": 3, + "num_relevant_docs": 1693, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.5792910447761195, + "max_relevant_docs_per_query": 22, + "unique_relevant_docs": 1693, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/CQADupstackWebmastersRetrieval-Fa.json b/mteb/descriptive_stats/Retrieval/CQADupstackWebmastersRetrieval-Fa.json new file mode 100644 index 0000000000..e660f63592 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/CQADupstackWebmastersRetrieval-Fa.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 17907, + "number_of_characters": 11695225, + "num_documents": 17405, + "min_document_length": 14, + "average_document_length": 670.4368859523125, + "max_document_length": 5076, + "unique_documents": 17405, + "num_queries": 502, + "min_query_length": 14, + 
"average_query_length": 52.332669322709165, + "max_query_length": 138, + "unique_queries": 506, + "none_queries": 4, + "num_relevant_docs": 1395, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 2.7569169960474307, + "max_relevant_docs_per_query": 207, + "unique_relevant_docs": 1395, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/CQADupstackWordpressRetrieval-Fa.json b/mteb/descriptive_stats/Retrieval/CQADupstackWordpressRetrieval-Fa.json new file mode 100644 index 0000000000..ac11341c52 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/CQADupstackWordpressRetrieval-Fa.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 49143, + "number_of_characters": 45959969, + "num_documents": 48605, + "min_document_length": 11, + "average_document_length": 945.0305112642732, + "max_document_length": 5089, + "unique_documents": 48605, + "num_queries": 538, + "min_query_length": 14, + "average_query_length": 49.74163568773234, + "max_query_length": 118, + "unique_queries": 541, + "none_queries": 3, + "num_relevant_docs": 744, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.3752310536044363, + "max_relevant_docs_per_query": 62, + "unique_relevant_docs": 744, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/ChemHotpotQARetrieval.json 
b/mteb/descriptive_stats/Retrieval/ChemHotpotQARetrieval.json new file mode 100644 index 0000000000..cc0f180b29 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/ChemHotpotQARetrieval.json @@ -0,0 +1,89 @@ +{ + "train": { + "num_samples": 10249, + "number_of_characters": 4073355, + "num_documents": 10069, + "min_document_length": 36, + "average_document_length": 402.44651901877046, + "max_document_length": 2661, + "unique_documents": 10069, + "num_queries": 180, + "min_query_length": 32, + "average_query_length": 117.33888888888889, + "max_query_length": 481, + "unique_queries": 180, + "none_queries": 0, + "num_relevant_docs": 187, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.038888888888889, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 123, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "dev": { + "num_samples": 10077, + "number_of_characters": 4053098, + "num_documents": 10069, + "min_document_length": 36, + "average_document_length": 402.44651901877046, + "max_document_length": 2661, + "unique_documents": 10069, + "num_queries": 8, + "min_query_length": 61, + "average_query_length": 108.0, + "max_query_length": 146, + "unique_queries": 8, + "none_queries": 0, + "num_relevant_docs": 8, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 8, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "test": { + "num_samples": 
10087, + "number_of_characters": 4054110, + "num_documents": 10069, + "min_document_length": 36, + "average_document_length": 402.44651901877046, + "max_document_length": 2661, + "unique_documents": 10069, + "num_queries": 18, + "min_query_length": 46, + "average_query_length": 104.22222222222223, + "max_query_length": 179, + "unique_queries": 18, + "none_queries": 0, + "num_relevant_docs": 18, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 18, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/ChemNQRetrieval.json b/mteb/descriptive_stats/Retrieval/ChemNQRetrieval.json new file mode 100644 index 0000000000..c692d3cc95 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/ChemNQRetrieval.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 22960, + "number_of_characters": 10651219, + "num_documents": 22933, + "min_document_length": 10, + "average_document_length": 464.3858631666158, + "max_document_length": 2801, + "unique_documents": 22933, + "num_queries": 27, + "min_query_length": 33, + "average_query_length": 54.0, + "max_query_length": 87, + "unique_queries": 27, + "none_queries": 0, + "num_relevant_docs": 35, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.2962962962962963, + "max_relevant_docs_per_query": 3, + "unique_relevant_docs": 35, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + 
"max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/FiQA2018-Fa.json b/mteb/descriptive_stats/Retrieval/FiQA2018-Fa.json new file mode 100644 index 0000000000..33d6143e1d --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/FiQA2018-Fa.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 58285, + "number_of_characters": 39874046, + "num_documents": 57638, + "min_document_length": 0, + "average_document_length": 691.1134321107603, + "max_document_length": 5267, + "unique_documents": 57638, + "num_queries": 647, + "min_query_length": 14, + "average_query_length": 61.282843894899536, + "max_query_length": 164, + "unique_queries": 648, + "none_queries": 1, + "num_relevant_docs": 1706, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 2.632716049382716, + "max_relevant_docs_per_query": 15, + "unique_relevant_docs": 1706, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/NFCorpus-Fa.json b/mteb/descriptive_stats/Retrieval/NFCorpus-Fa.json new file mode 100644 index 0000000000..cf3ac61902 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/NFCorpus-Fa.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 3956, + "number_of_characters": 5048794, + "num_documents": 3633, + "min_document_length": 107, + "average_document_length": 1387.89843104872, + "max_document_length": 4772, + "unique_documents": 3633, + "num_queries": 323, + "min_query_length": 3, + "average_query_length": 20.306501547987615, + "max_query_length": 83, + "unique_queries": 323, + "none_queries": 0, + "num_relevant_docs": 12334, + "min_relevant_docs_per_query": 1, + 
"average_relevant_docs_per_query": 38.18575851393189, + "max_relevant_docs_per_query": 475, + "unique_relevant_docs": 3128, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/PersianWebDocumentRetrieval.json b/mteb/descriptive_stats/Retrieval/PersianWebDocumentRetrieval.json new file mode 100644 index 0000000000..65bee298db --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/PersianWebDocumentRetrieval.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 167770, + "number_of_characters": 41085548, + "num_documents": 166819, + "min_document_length": 0, + "average_document_length": 246.1896966172918, + "max_document_length": 74016, + "unique_documents": 166819, + "num_queries": 951, + "min_query_length": 3, + "average_query_length": 17.275499474237645, + "max_query_length": 59, + "unique_queries": 951, + "none_queries": 0, + "num_relevant_docs": 18308, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 8.963196635120925, + "max_relevant_docs_per_query": 46, + "unique_relevant_docs": 17974, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/QuoraRetrieval-Fa.json b/mteb/descriptive_stats/Retrieval/QuoraRetrieval-Fa.json new file mode 100644 index 0000000000..5b03eeeb5a --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/QuoraRetrieval-Fa.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 
532928, + "number_of_characters": 31039313, + "num_documents": 522931, + "min_document_length": 0, + "average_document_length": 58.43793540639205, + "max_document_length": 1172, + "unique_documents": 522931, + "num_queries": 9997, + "min_query_length": 2, + "average_query_length": 48.04491347404221, + "max_query_length": 202, + "unique_queries": 10000, + "none_queries": 3, + "num_relevant_docs": 15675, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.5675, + "max_relevant_docs_per_query": 75, + "unique_relevant_docs": 15675, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/SCIDOCS-Fa.json b/mteb/descriptive_stats/Retrieval/SCIDOCS-Fa.json new file mode 100644 index 0000000000..eec9b63f3d --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/SCIDOCS-Fa.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 26657, + "number_of_characters": 26828338, + "num_documents": 25657, + "min_document_length": 0, + "average_document_length": 1043.2618388743813, + "max_document_length": 5051, + "unique_documents": 25657, + "num_queries": 1000, + "min_query_length": 15, + "average_query_length": 61.369, + "max_query_length": 206, + "unique_queries": 1000, + "none_queries": 0, + "num_relevant_docs": 29928, + "min_relevant_docs_per_query": 27, + "average_relevant_docs_per_query": 4.928, + "max_relevant_docs_per_query": 30, + "unique_relevant_docs": 25657, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + 
"max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/SciFact-Fa.json b/mteb/descriptive_stats/Retrieval/SciFact-Fa.json new file mode 100644 index 0000000000..42188525cd --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/SciFact-Fa.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 5483, + "number_of_characters": 6846379, + "num_documents": 5183, + "min_document_length": 170, + "average_document_length": 1316.248504726992, + "max_document_length": 4797, + "unique_documents": 5183, + "num_queries": 300, + "min_query_length": 24, + "average_query_length": 80.87666666666667, + "max_query_length": 207, + "unique_queries": 300, + "none_queries": 0, + "num_relevant_docs": 339, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.13, + "max_relevant_docs_per_query": 5, + "unique_relevant_docs": 283, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/SynPerChatbotRAGFAQRetrieval.json b/mteb/descriptive_stats/Retrieval/SynPerChatbotRAGFAQRetrieval.json new file mode 100644 index 0000000000..a5136be93a --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/SynPerChatbotRAGFAQRetrieval.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 9783, + "number_of_characters": 1950156, + "num_documents": 8696, + "min_document_length": 56, + "average_document_length": 145.69353725850965, + "max_document_length": 342, + "unique_documents": 8696, + "num_queries": 1087, + "min_query_length": 37, + "average_query_length": 628.5234590616375, + "max_query_length": 2601, + "unique_queries": 1087, + "none_queries": 0, + "num_relevant_docs": 4348, + "min_relevant_docs_per_query": 4, 
+ "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 4, + "unique_relevant_docs": 3322, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/SynPerChatbotRAGTopicsRetrieval.json b/mteb/descriptive_stats/Retrieval/SynPerChatbotRAGTopicsRetrieval.json new file mode 100644 index 0000000000..c44b1b36c7 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/SynPerChatbotRAGTopicsRetrieval.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 5233, + "number_of_characters": 733909, + "num_documents": 4146, + "min_document_length": 2, + "average_document_length": 12.22961890979257, + "max_document_length": 35, + "unique_documents": 4146, + "num_queries": 1087, + "min_query_length": 37, + "average_query_length": 628.5234590616375, + "max_query_length": 2601, + "unique_queries": 1087, + "none_queries": 0, + "num_relevant_docs": 14008, + "min_relevant_docs_per_query": 8, + "average_relevant_docs_per_query": 3.2217111315547378, + "max_relevant_docs_per_query": 24, + "unique_relevant_docs": 3083, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/SynPerChatbotTopicsRetrieval.json b/mteb/descriptive_stats/Retrieval/SynPerChatbotTopicsRetrieval.json new file mode 100644 index 0000000000..8cb02de504 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/SynPerChatbotTopicsRetrieval.json @@ -0,0 +1,31 @@ +{ + 
"test": { + "num_samples": 7753, + "number_of_characters": 1536226, + "num_documents": 6216, + "min_document_length": 2, + "average_document_length": 12.305662805662806, + "max_document_length": 38, + "unique_documents": 6216, + "num_queries": 1537, + "min_query_length": 383, + "average_query_length": 949.729342875732, + "max_query_length": 1828, + "unique_queries": 1537, + "none_queries": 0, + "num_relevant_docs": 19648, + "min_relevant_docs_per_query": 8, + "average_relevant_docs_per_query": 3.19583604424203, + "max_relevant_docs_per_query": 24, + "unique_relevant_docs": 4101, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/SynPerQARetrieval.json b/mteb/descriptive_stats/Retrieval/SynPerQARetrieval.json new file mode 100644 index 0000000000..e159d4feed --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/SynPerQARetrieval.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 297885, + "number_of_characters": 83903015, + "num_documents": 270642, + "min_document_length": 3, + "average_document_length": 304.04882095166306, + "max_document_length": 1214, + "unique_documents": 270642, + "num_queries": 27243, + "min_query_length": 10, + "average_query_length": 59.26784862166428, + "max_query_length": 172, + "unique_queries": 27243, + "none_queries": 0, + "num_relevant_docs": 111136, + "min_relevant_docs_per_query": 4, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 4, + "unique_relevant_docs": 85385, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + 
"min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/TRECCOVID-Fa.json b/mteb/descriptive_stats/Retrieval/TRECCOVID-Fa.json new file mode 100644 index 0000000000..972562b429 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/TRECCOVID-Fa.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 171382, + "number_of_characters": 165530102, + "num_documents": 171332, + "min_document_length": 0, + "average_document_length": 966.1181857446361, + "max_document_length": 5177, + "unique_documents": 171332, + "num_queries": 50, + "min_query_length": 19, + "average_query_length": 62.82, + "max_query_length": 179, + "unique_queries": 50, + "none_queries": 0, + "num_relevant_docs": 66336, + "min_relevant_docs_per_query": 631, + "average_relevant_docs_per_query": 493.5, + "max_relevant_docs_per_query": 1941, + "unique_relevant_docs": 35480, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/STS/Farsick.json b/mteb/descriptive_stats/STS/Farsick.json new file mode 100644 index 0000000000..965eed3fe1 --- /dev/null +++ b/mteb/descriptive_stats/STS/Farsick.json @@ -0,0 +1,18 @@ +{ + "test": { + "num_samples": 8566, + "number_of_characters": 761322, + "unique_pairs": 8493, + "min_sentence1_length": 11, + "average_sentence1_len": 44.69098762549615, + "max_sentence1_length": 140, + "unique_sentence1": 6778, + "min_sentence2_length": 11, + "average_sentence2_len": 44.186201260798505, + "max_sentence2_length": 153, + "unique_sentence2": 6716, + "min_score": 1.0, + "avg_score": 3.370764067242593, + "max_score": 5.0 + } +} \ No newline at 
end of file diff --git a/mteb/descriptive_stats/STS/Query2Query.json b/mteb/descriptive_stats/STS/Query2Query.json new file mode 100644 index 0000000000..a1d40ed75c --- /dev/null +++ b/mteb/descriptive_stats/STS/Query2Query.json @@ -0,0 +1,18 @@ +{ + "test": { + "num_samples": 30001, + "number_of_characters": 1526887, + "unique_pairs": 24202, + "min_sentence1_length": 3, + "average_sentence1_len": 25.499783340555314, + "max_sentence1_length": 252, + "unique_sentence1": 18670, + "min_sentence2_length": 2, + "average_sentence2_len": 25.394753508216393, + "max_sentence2_length": 331, + "unique_sentence2": 21402, + "min_score": 0.0, + "avg_score": 0.11247824336221741, + "max_score": 1.0 + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/STS/SynPerSTS.json b/mteb/descriptive_stats/STS/SynPerSTS.json new file mode 100644 index 0000000000..13390e1b3f --- /dev/null +++ b/mteb/descriptive_stats/STS/SynPerSTS.json @@ -0,0 +1,18 @@ +{ + "test": { + "num_samples": 12385, + "number_of_characters": 3024219, + "unique_pairs": 12385, + "min_sentence1_length": 54, + "average_sentence1_len": 114.28058134840533, + "max_sentence1_length": 198, + "unique_sentence1": 2476, + "min_sentence2_length": 45, + "average_sentence2_len": 129.90343157044813, + "max_sentence2_length": 393, + "unique_sentence2": 11965, + "min_score": 1, + "avg_score": 3.0, + "max_score": 5 + } +} \ No newline at end of file diff --git a/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py b/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py index b3e297e043..22a0106239 100644 --- a/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py +++ b/mteb/tasks/PairClassification/eng/PubChemSMILESPC.py @@ -111,7 +111,7 @@ def dataset_transform(self): self.dataset = self.stratified_subsampling( self.dataset, seed=self.seed, - splits=self.metadata["eval_splits"], + splits=self.metadata.eval_splits, label="labels", ) From b21eea18287477e7985532cdff2910ecf5f9f062 Mon Sep 17 00:00:00 2001 From: Roman 
Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 2 Feb 2025 17:29:48 +0300 Subject: [PATCH 269/275] remove print from script generate_imports --- scripts/generate_imports.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/scripts/generate_imports.py b/scripts/generate_imports.py index 4961991034..ce637e048e 100644 --- a/scripts/generate_imports.py +++ b/scripts/generate_imports.py @@ -33,9 +33,6 @@ def find_task_classes_in_module(full_module_name): and not obj.__name__ == "MultilingualTask" ): if is_module_aggregated: - print( - "aggregated_tasks", obj.__name__, issubclass(obj, AbsTaskAggregate) - ) if issubclass(obj, AbsTaskAggregate): task_classes.append(name) continue From 845491e3edf1cc3009a715bc4c582498bb8dc409 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 2 Feb 2025 18:48:19 +0300 Subject: [PATCH 270/275] add rest of metadata --- .../Retrieval/ClimateFEVER-Fa.json | 31 +++++++++++++++++++ .../Retrieval/DBPedia-Fa.json | 31 +++++++++++++++++++ .../Retrieval/HotpotQA-Fa.json | 31 +++++++++++++++++++ .../Retrieval/MSMARCO-Fa.json | 31 +++++++++++++++++++ mteb/descriptive_stats/Retrieval/NQ-Fa.json | 31 +++++++++++++++++++ .../Retrieval/Touche2020-Fa.json | 31 +++++++++++++++++++ tests/test_TaskMetadata.py | 6 +++- 7 files changed, 191 insertions(+), 1 deletion(-) create mode 100644 mteb/descriptive_stats/Retrieval/ClimateFEVER-Fa.json create mode 100644 mteb/descriptive_stats/Retrieval/DBPedia-Fa.json create mode 100644 mteb/descriptive_stats/Retrieval/HotpotQA-Fa.json create mode 100644 mteb/descriptive_stats/Retrieval/MSMARCO-Fa.json create mode 100644 mteb/descriptive_stats/Retrieval/NQ-Fa.json create mode 100644 mteb/descriptive_stats/Retrieval/Touche2020-Fa.json diff --git a/mteb/descriptive_stats/Retrieval/ClimateFEVER-Fa.json b/mteb/descriptive_stats/Retrieval/ClimateFEVER-Fa.json new file mode 100644 index 0000000000..a79397630b --- /dev/null +++ 
b/mteb/descriptive_stats/Retrieval/ClimateFEVER-Fa.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 5418128, + "number_of_characters": 2529680839, + "num_documents": 5416593, + "min_document_length": 2, + "average_document_length": 466.99207139986333, + "max_document_length": 5259, + "unique_documents": 5416593, + "num_queries": 1535, + "min_query_length": 26, + "average_query_length": 113.9114006514658, + "max_query_length": 381, + "unique_queries": 1535, + "none_queries": 0, + "num_relevant_docs": 4681, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 3.0495114006514656, + "max_relevant_docs_per_query": 5, + "unique_relevant_docs": 1344, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/DBPedia-Fa.json b/mteb/descriptive_stats/Retrieval/DBPedia-Fa.json new file mode 100644 index 0000000000..8285e7c948 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/DBPedia-Fa.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 4636322, + "number_of_characters": 1288938743, + "num_documents": 4635922, + "min_document_length": 0, + "average_document_length": 278.0301890325161, + "max_document_length": 5772, + "unique_documents": 4635922, + "num_queries": 400, + "min_query_length": 6, + "average_query_length": 31.1825, + "max_query_length": 90, + "unique_queries": 400, + "none_queries": 0, + "num_relevant_docs": 43515, + "min_relevant_docs_per_query": 21, + "average_relevant_docs_per_query": 38.215, + "max_relevant_docs_per_query": 1499, + "unique_relevant_docs": 40724, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + 
"unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/HotpotQA-Fa.json b/mteb/descriptive_stats/Retrieval/HotpotQA-Fa.json new file mode 100644 index 0000000000..c90930426e --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/HotpotQA-Fa.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 5240734, + "number_of_characters": 1357921635, + "num_documents": 5233329, + "min_document_length": 0, + "average_document_length": 259.36072144518334, + "max_document_length": 4687, + "unique_documents": 5233329, + "num_queries": 7405, + "min_query_length": 22, + "average_query_length": 81.2491559756921, + "max_query_length": 230, + "unique_queries": 7405, + "none_queries": 0, + "num_relevant_docs": 14810, + "min_relevant_docs_per_query": 2, + "average_relevant_docs_per_query": 2.0, + "max_relevant_docs_per_query": 2, + "unique_relevant_docs": 13783, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/MSMARCO-Fa.json b/mteb/descriptive_stats/Retrieval/MSMARCO-Fa.json new file mode 100644 index 0000000000..95d640a138 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/MSMARCO-Fa.json @@ -0,0 +1,31 @@ +{ + "dev": { + "num_samples": 8848803, + "number_of_characters": 2707180637, + "num_documents": 8841823, + "min_document_length": 0, + "average_document_length": 306.1560844409575, + "max_document_length": 1617, + "unique_documents": 8841823, + "num_queries": 6980, + "min_query_length": 7, + "average_query_length": 29.044126074498568, + "max_query_length": 158, + 
"unique_queries": 6980, + "none_queries": 0, + "num_relevant_docs": 7437, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0654727793696275, + "max_relevant_docs_per_query": 4, + "unique_relevant_docs": 7433, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/NQ-Fa.json b/mteb/descriptive_stats/Retrieval/NQ-Fa.json new file mode 100644 index 0000000000..fb84973c36 --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/NQ-Fa.json @@ -0,0 +1,31 @@ +{ + "test": { + "num_samples": 2684920, + "number_of_characters": 1167597064, + "num_documents": 2681468, + "min_document_length": 3, + "average_document_length": 435.3785184085732, + "max_document_length": 5304, + "unique_documents": 2681468, + "num_queries": 3452, + "min_query_length": 15, + "average_query_length": 41.56981460023175, + "max_query_length": 102, + "unique_queries": 3452, + "none_queries": 0, + "num_relevant_docs": 4201, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.2169756662804172, + "max_relevant_docs_per_query": 4, + "unique_relevant_docs": 4201, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/mteb/descriptive_stats/Retrieval/Touche2020-Fa.json b/mteb/descriptive_stats/Retrieval/Touche2020-Fa.json new file mode 100644 index 0000000000..2234d28eee --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/Touche2020-Fa.json @@ -0,0 
+1,31 @@ +{ + "test": { + "num_samples": 382594, + "number_of_characters": 512083705, + "num_documents": 382545, + "min_document_length": 3, + "average_document_length": 1338.6178750212393, + "max_document_length": 5929, + "unique_documents": 382545, + "num_queries": 49, + "min_query_length": 20, + "average_query_length": 43.46938775510204, + "max_query_length": 76, + "unique_queries": 49, + "none_queries": 0, + "num_relevant_docs": 2214, + "min_relevant_docs_per_query": 40, + "average_relevant_docs_per_query": 19.020408163265305, + "max_relevant_docs_per_query": 52, + "unique_relevant_docs": 2099, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "num_top_ranked": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } +} \ No newline at end of file diff --git a/tests/test_TaskMetadata.py b/tests/test_TaskMetadata.py index 203dd00d79..897aab60b1 100644 --- a/tests/test_TaskMetadata.py +++ b/tests/test_TaskMetadata.py @@ -4,6 +4,7 @@ from pydantic import ValidationError from mteb.abstasks import AbsTask, TaskMetadata +from mteb.abstasks.aggregated_task import AbsTaskAggregate from mteb.overview import get_tasks # Historic datasets without filled metadata. Do NOT add new datasets to this list. 
@@ -519,7 +520,10 @@ def test_disallow_trust_remote_code_in_new_datasets(): @pytest.mark.parametrize("task", get_tasks()) def test_empty_descriptive_stat_in_new_datasets(task: AbsTask): - if task.metadata.name.startswith("Mock"): + if task.metadata.name.startswith("Mock") or isinstance(task, AbsTaskAggregate): + return + + if task.metadata.name in ["CodeRAGStackoverflowPosts"]: return assert ( From 7e8be0300c33b095458bef3fc21b6a7444e807c4 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 2 Feb 2025 19:17:15 +0300 Subject: [PATCH 271/275] fix tests --- .../tasks/BitextMining/kat/TbilisiCityHallBitextMining.py | 6 +++--- .../Classification/multilingual/TurkicClassification.py | 8 +++----- mteb/tasks/PairClassification/multilingual/XStance.py | 4 ++-- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/mteb/tasks/BitextMining/kat/TbilisiCityHallBitextMining.py b/mteb/tasks/BitextMining/kat/TbilisiCityHallBitextMining.py index c5bf62f5cf..3dda0cb5cb 100644 --- a/mteb/tasks/BitextMining/kat/TbilisiCityHallBitextMining.py +++ b/mteb/tasks/BitextMining/kat/TbilisiCityHallBitextMining.py @@ -1,6 +1,6 @@ from __future__ import annotations -from datasets import DatasetDict, load_dataset +import datasets from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining from mteb.abstasks.MultilingualTask import MultilingualTask @@ -50,7 +50,7 @@ def load_data(self, **kwargs) -> None: for lang in self.hf_subsets: l1, l2 = lang.split("-") - dataset = load_dataset( + dataset = datasets.load_dataset( self.metadata.dataset["path"], split=_EVAL_SPLIT, cache_dir=kwargs.get("cache_dir", None), @@ -59,6 +59,6 @@ def load_data(self, **kwargs) -> None: dataset = dataset.rename_columns( {_LANGUAGES[l1]: "sentence1", _LANGUAGES[l2]: "sentence2"} ) - self.dataset[lang] = DatasetDict({_EVAL_SPLIT: dataset}) + self.dataset[lang] = datasets.DatasetDict({_EVAL_SPLIT: dataset}) self.data_loaded = True diff --git 
a/mteb/tasks/Classification/multilingual/TurkicClassification.py b/mteb/tasks/Classification/multilingual/TurkicClassification.py index 3ab0684df1..3078ac20ac 100644 --- a/mteb/tasks/Classification/multilingual/TurkicClassification.py +++ b/mteb/tasks/Classification/multilingual/TurkicClassification.py @@ -43,14 +43,12 @@ class TurkicClassification(MultilingualTask, AbsTaskClassification): def transform_data(self, dataset, lang): dataset_lang = DatasetDict() label_count = Counter(dataset["train"]["label"]) - dataset_lang["train"] = dataset["train"].filter( + dataset_lang = dataset["train"].filter( lambda example: example["lang"] == lang and label_count[example["label"]] >= 20 ) - dataset_lang = self.stratified_subsampling( - dataset_lang, seed=self.seed, splits=["train"] - ) - return dataset_lang["train"] + dataset_lang = self.stratified_subsampling(dataset_lang, seed=self.seed) + return dataset_lang def load_data(self, **kwargs): """Load dataset from HuggingFace hub""" diff --git a/mteb/tasks/PairClassification/multilingual/XStance.py b/mteb/tasks/PairClassification/multilingual/XStance.py index 3c94ec4b60..81c0b9adbf 100644 --- a/mteb/tasks/PairClassification/multilingual/XStance.py +++ b/mteb/tasks/PairClassification/multilingual/XStance.py @@ -1,6 +1,6 @@ from __future__ import annotations -from datasets import load_dataset +import datasets from mteb.abstasks.MultilingualTask import MultilingualTask from mteb.abstasks.TaskMetadata import TaskMetadata @@ -58,7 +58,7 @@ def load_data(self, **kwargs): self.dataset = {} path = self.metadata.dataset["path"] revision = self.metadata.dataset["revision"] - raw_dataset = load_dataset( + raw_dataset = datasets.load_dataset( path, revision=revision, trust_remote_code=self.metadata.dataset["trust_remote_code"], From 913781a09fcc78c7b3508ac88d21a5fdd0323814 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 2 Feb 2025 20:04:16 +0300 Subject: [PATCH 272/275] add todo for test 
--- tests/test_TaskMetadata.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_TaskMetadata.py b/tests/test_TaskMetadata.py index 897aab60b1..a7ede62674 100644 --- a/tests/test_TaskMetadata.py +++ b/tests/test_TaskMetadata.py @@ -523,6 +523,7 @@ def test_empty_descriptive_stat_in_new_datasets(task: AbsTask): if task.metadata.name.startswith("Mock") or isinstance(task, AbsTaskAggregate): return + # TODO add descriptive_stat for CodeRAGStackoverflowPosts. Required > 128GB of RAM if task.metadata.name in ["CodeRAGStackoverflowPosts"]: return From 86d335895bf189c8776d3ba4d857aafefeb97ed5 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Mon, 3 Feb 2025 10:34:48 +0300 Subject: [PATCH 273/275] Revert "fix tests" This reverts commit 7e8be0300c33b095458bef3fc21b6a7444e807c4. --- .../tasks/BitextMining/kat/TbilisiCityHallBitextMining.py | 6 +++--- .../Classification/multilingual/TurkicClassification.py | 8 +++++--- mteb/tasks/PairClassification/multilingual/XStance.py | 4 ++-- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/mteb/tasks/BitextMining/kat/TbilisiCityHallBitextMining.py b/mteb/tasks/BitextMining/kat/TbilisiCityHallBitextMining.py index 3dda0cb5cb..c5bf62f5cf 100644 --- a/mteb/tasks/BitextMining/kat/TbilisiCityHallBitextMining.py +++ b/mteb/tasks/BitextMining/kat/TbilisiCityHallBitextMining.py @@ -1,6 +1,6 @@ from __future__ import annotations -import datasets +from datasets import DatasetDict, load_dataset from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining from mteb.abstasks.MultilingualTask import MultilingualTask @@ -50,7 +50,7 @@ def load_data(self, **kwargs) -> None: for lang in self.hf_subsets: l1, l2 = lang.split("-") - dataset = datasets.load_dataset( + dataset = load_dataset( self.metadata.dataset["path"], split=_EVAL_SPLIT, cache_dir=kwargs.get("cache_dir", None), @@ -59,6 +59,6 @@ def load_data(self, **kwargs) -> None: dataset = dataset.rename_columns( 
{_LANGUAGES[l1]: "sentence1", _LANGUAGES[l2]: "sentence2"} ) - self.dataset[lang] = datasets.DatasetDict({_EVAL_SPLIT: dataset}) + self.dataset[lang] = DatasetDict({_EVAL_SPLIT: dataset}) self.data_loaded = True diff --git a/mteb/tasks/Classification/multilingual/TurkicClassification.py b/mteb/tasks/Classification/multilingual/TurkicClassification.py index 3078ac20ac..3ab0684df1 100644 --- a/mteb/tasks/Classification/multilingual/TurkicClassification.py +++ b/mteb/tasks/Classification/multilingual/TurkicClassification.py @@ -43,12 +43,14 @@ class TurkicClassification(MultilingualTask, AbsTaskClassification): def transform_data(self, dataset, lang): dataset_lang = DatasetDict() label_count = Counter(dataset["train"]["label"]) - dataset_lang = dataset["train"].filter( + dataset_lang["train"] = dataset["train"].filter( lambda example: example["lang"] == lang and label_count[example["label"]] >= 20 ) - dataset_lang = self.stratified_subsampling(dataset_lang, seed=self.seed) - return dataset_lang + dataset_lang = self.stratified_subsampling( + dataset_lang, seed=self.seed, splits=["train"] + ) + return dataset_lang["train"] def load_data(self, **kwargs): """Load dataset from HuggingFace hub""" diff --git a/mteb/tasks/PairClassification/multilingual/XStance.py b/mteb/tasks/PairClassification/multilingual/XStance.py index 81c0b9adbf..3c94ec4b60 100644 --- a/mteb/tasks/PairClassification/multilingual/XStance.py +++ b/mteb/tasks/PairClassification/multilingual/XStance.py @@ -1,6 +1,6 @@ from __future__ import annotations -import datasets +from datasets import load_dataset from mteb.abstasks.MultilingualTask import MultilingualTask from mteb.abstasks.TaskMetadata import TaskMetadata @@ -58,7 +58,7 @@ def load_data(self, **kwargs): self.dataset = {} path = self.metadata.dataset["path"] revision = self.metadata.dataset["revision"] - raw_dataset = datasets.load_dataset( + raw_dataset = load_dataset( path, revision=revision, 
trust_remote_code=self.metadata.dataset["trust_remote_code"], From 8ef483eed50b90fbdf141a471bd9767ddc3f3cce Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Mon, 3 Feb 2025 10:35:44 +0300 Subject: [PATCH 274/275] add back check for multilingual --- tests/test_tasks/test_all_abstasks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_tasks/test_all_abstasks.py b/tests/test_tasks/test_all_abstasks.py index 9a5843bbfd..714da6ff68 100644 --- a/tests/test_tasks/test_all_abstasks.py +++ b/tests/test_tasks/test_all_abstasks.py @@ -8,7 +8,7 @@ import pytest import mteb -from mteb.abstasks import AbsTask +from mteb.abstasks import AbsTask, MultilingualTask from mteb.abstasks.AbsTaskReranking import AbsTaskReranking from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval from mteb.abstasks.AbsTaskSpeedTask import AbsTaskSpeedTask @@ -33,6 +33,7 @@ def test_load_data( isinstance(task, AbsTaskRetrieval) or isinstance(task, AbsTaskReranking) or isinstance(task, AbsTaskSpeedTask) + or isinstance(task, MultilingualTask) ): pytest.skip() with patch.object(task, "dataset_transform") as mock_dataset_transform: From 4e733bde8cd167b17aef8f1be5d6b818056b7154 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Tue, 4 Feb 2025 15:38:59 +0300 Subject: [PATCH 275/275] fix imports --- mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py | 3 +-- .../multilingual/PubChemWikiPairClassification.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py b/mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py index 4951d8c596..1ed5921464 100644 --- a/mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py +++ b/mteb/tasks/BitextMining/eng/PubChemSMILESBitextMining.py @@ -1,7 +1,6 @@ from __future__ import annotations from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining -from 
mteb.abstasks.MultilingualTask import MultilingualTask from mteb.abstasks.TaskMetadata import TaskMetadata COL_MAPPING = { @@ -19,7 +18,7 @@ } -class PubChemSMILESBitextMining(MultilingualTask, AbsTaskBitextMining): +class PubChemSMILESBitextMining(AbsTaskBitextMining): metadata = TaskMetadata( name="PubChemSMILESBitextMining", dataset={ diff --git a/mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py b/mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py index 59a0605a82..e98ce4afcd 100644 --- a/mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py +++ b/mteb/tasks/PairClassification/multilingual/PubChemWikiPairClassification.py @@ -1,7 +1,6 @@ from __future__ import annotations from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification -from mteb.abstasks.MultilingualTask import MultilingualTask from mteb.abstasks.TaskMetadata import TaskMetadata _LANGUAGES = { @@ -20,7 +19,7 @@ } -class PubChemWikiPairClassification(AbsTaskPairClassification, MultilingualTask): +class PubChemWikiPairClassification(AbsTaskPairClassification): metadata = TaskMetadata( name="PubChemWikiPairClassification", dataset={